knitr::opts_chunk$set(echo = TRUE)

#empty environment before starting the script
rm(list=ls()) 

# Load necessary libraries
library(googlesheets4)
library(googledrive)
## 
## Attaching package: 'googledrive'
## The following objects are masked from 'package:googlesheets4':
## 
##     request_generate, request_make
library(RColorBrewer)    # plotting (color palettes)
library(paletteer)       # plotting (color palettes)
library(gridExtra)       # plotting (arrange multiple plots together)
library(patchwork)       # plotting (combine multiple plots)
library(corrplot)        # plotting (correlation plots)
## corrplot 0.95 loaded
library(ggpubr)          # plotting (publication-ready plots)
## Loading required package: ggplot2
library(lubridate)       # date and time data
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(hms)             # time data
## 
## Attaching package: 'hms'
## The following object is masked from 'package:lubridate':
## 
##     hms
library(Hmisc)           # used for correlation analysis
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:patchwork':
## 
##     area
library(lme4)            # GLMM (esp Poisson)
## Loading required package: Matrix
library(glmmTMB)         # GLMM (esp negbin)
library(DHARMa)          # checking GLMM assumptions
## This is DHARMa 0.4.7. For overview type '?DHARMa'. For recent changes, type news(package = 'DHARMa')
library(easystats)       # GLMM assumptions and model performance
## # Attaching packages: easystats 0.7.4 (red = needs update)
## ✔ bayestestR  0.15.2   ✔ correlation 0.8.7 
## ✖ datawizard  1.0.1    ✔ effectsize  1.0.0 
## ✔ insight     1.1.0    ✔ modelbased  0.10.0
## ✔ performance 0.13.0   ✔ parameters  0.24.2
## ✔ report      0.6.1    ✔ see         0.11.0
## 
## Restart the R-Session and update packages with `easystats::easystats_update()`.
library(broom.mixed)     # tidying mixed models results
library(ggeffects)       # plot model predictions
## 
## Attaching package: 'ggeffects'
## The following object is masked from 'package:modelbased':
## 
##     pool_predictions
## The following object is masked from 'package:easystats':
## 
##     install_latest
library(emmeans)         # EMMs (used for post hoc tests)
## Welcome to emmeans.
## Caution: You lose important information if you filter this package's results.
## See '? untidy'
library(multcompView)    # plot results of post hoc text
library(sjPlot)          # effect sizes plot
## Install package "strengejacke" from GitHub (`devtools::install_github("strengejacke/strengejacke")`) to load all sj-packages at once!
library(car)             # anova type 2 & 3 tables
## Loading required package: carData
library(tidyverse)       # data wrangling
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr   1.1.4     ✔ stringr 1.5.1
## ✔ forcats 1.0.0     ✔ tibble  3.2.1
## ✔ purrr   1.0.2     ✔ tidyr   1.3.1
## ✔ readr   2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine()                masks gridExtra::combine()
## ✖ tidyr::expand()                 masks Matrix::expand()
## ✖ dplyr::filter()                 masks stats::filter()
## ✖ hms::hms()                      masks lubridate::hms()
## ✖ dplyr::lag()                    masks stats::lag()
## ✖ tidyr::pack()                   masks Matrix::pack()
## ✖ dplyr::recode()                 masks car::recode()
## ✖ googledrive::request_generate() masks googlesheets4::request_generate()
## ✖ googledrive::request_make()     masks googlesheets4::request_make()
## ✖ dplyr::select()                 masks MASS::select()
## ✖ purrr::some()                   masks car::some()
## ✖ dplyr::src()                    masks Hmisc::src()
## ✖ dplyr::summarize()              masks Hmisc::summarize()
## ✖ tidyr::unpack()                 masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Thesis Data Analysis

In this document I go through each of my research questions and perform all statistical analyses necessary to answer them.

Before going into each research question, I load and prepare all the necessary data.

0. Data loading and general data preparation

0.1 Loading tables from Google Drive

The three main data tables (environmental data and information on experiment start time/location/etc., species identification data, experiment data with amount of ants observed at different time points) are loaded and the variables are reformatted as needed.

# loading the google sheet containing the experiment data
exp_data <- drive_get("experiment_data_all_years") %>%
  read_sheet(, sheet = 2)
## ! Using an auto-discovered, cached token.
##   To suppress this message, modify your code or options to clearly consent to
##   the use of a cached token.
##   See gargle's "Non-interactive auth" vignette for more details:
##   <https://gargle.r-lib.org/articles/non-interactive-auth.html>
## ℹ The googledrive package is using a cached token for
##   'nelma.peclard@gmail.com'.
## Auto-refreshing stale OAuth token.
## ✔ The input `path` resolved to exactly 1 file.
## ! Using an auto-discovered, cached token.
##   To suppress this message, modify your code or options to clearly consent to
##   the use of a cached token.
##   See gargle's "Non-interactive auth" vignette for more details:
##   <https://gargle.r-lib.org/articles/non-interactive-auth.html>
## ℹ The googlesheets4 package is using a cached token for
##   'nelma.peclard@gmail.com'.
## Auto-refreshing stale OAuth token.
## ✔ Reading from "experiment_data_all_years".
## ✔ Range ''all_years''.
exp_data$count_time <- as.character(exp_data$count_time)  # turning the count_time column into strings

# loading the google sheet containing the environmental data
env_data <- drive_get("full_env_data_all_years") %>%
  read_sheet(, sheet = 2)
## ✔ The input `path` resolved to exactly 1 file.
## ✔ Reading from "full_env_data_all_years".
## ✔ Range ''all_years''.
env_data$date_time <- dmy_hm(paste0(env_data$date, sep = " ", env_data$start_time))
## Warning: 4 failed to parse.
env_data$date <- dmy(env_data$date)  # turning the date column into a date object
env_data$start_time[!is.na(env_data$start_time)] <- paste0(env_data$start_time[!is.na(env_data$start_time)], ":00") 
env_data$start_time <- as_hms(env_data$start_time)  # turning the start_time column into a hh:mm:ss time object
env_data$end_time[!is.na(env_data$end_time)] <- paste0(env_data$end_time[!is.na(env_data$end_time)], ":00") 
env_data$end_time <- as_hms(env_data$end_time)  # turning the end_time column into a hh:mm:ss time object

# loading the google sheet containing the identified species data
species_data <- drive_get("species_data_all_years") %>%
  read_sheet(, sheet = 2)
## ✔ The input `path` resolved to exactly 1 file.
## ✔ Reading from "species_data_all_years".
## ✔ Range ''all_years''.

0.2 Preparation of ant counts data

0.2.1 Ant totals

For most of the analyses, I need the total number of ants found on all baits of one experiment at the end of the experiment.

# first calculation of the total number of ants counted after 120 min, after freezing, and in the lab before identification
tmp_exp_totals <- exp_data %>%
  filter(count_time %in% c("120", "Frozen", "Lab")) %>%  # Keep only relevant time points
  group_by(sample_ID, count_time) %>%  # Group by experiment and time point
  summarise(
    total_ants = if_else(all(is.na(individuals)), NA, sum(individuals, na.rm = TRUE)),  
    .groups = "drop") %>%  
  pivot_wider(names_from = count_time, values_from = total_ants, names_prefix = "total_")  # Convert to wide format

# comparing the number of experiments where all values were NA between 120 min, after freezing, in the lab
length(tmp_exp_totals$total_120[is.na(tmp_exp_totals$total_120)]) 
## [1] 11
length(tmp_exp_totals$total_Frozen[is.na(tmp_exp_totals$total_Frozen)])
## [1] 56
length(tmp_exp_totals$total_Lab[is.na(tmp_exp_totals$total_Lab)])
## [1] 42

There are a lot more rows with missing values in the total ants counted after freezing or in the lab than at 120 minutes. This is very likely in part due to participants not reporting 0s after freezing on experiments where no ant was collected, and not sending back the empty ethanol vials.

I filtered the data table on Google Sheets and found that the number of experiments which have 0 ants at 120 minutes for all sugar concentrations but all missing values for lab counting is 13. Since participants wrote that there were 0 ants on all concentrations of the experiment at 120 minutes (the end of the experiment), I can consider that there were no ants to be sent back and counted in the lab. In those specific cases, I replace the NAs recorded in the lab by 0s. I do not modify experiments where one or more of the concentration had anything other than 0 (whether NA or positive value) ants at 120 minutes.

# extract experiments where all concentrations had 0 ants at 120 and lab has all NAs

# for each experiment, check that at count_time "120" the individuals count is 0 for every concentration.
tmp_exp_zero_120 <- exp_data %>%
  filter(count_time == "120") %>%  # select only rows at 120 minutes
  group_by(sample_ID) %>%
  summarise(
    n_obs = n(),                  # Count number of rows (should be 5 i.e. all 5 baits had no ants)
    all_zero = all(individuals == 0)     # Will be NA if any value is NA
    ) %>%
  filter(n_obs == 5, all_zero == TRUE) %>%  # Keep only experiments with 5 observations all equal to 0
  pull(sample_ID) 

# for each experiment, filter those with NA at each concentration at count_time "lab"
tmp_exp_na_lab <- exp_data %>%
  filter(count_time == "Lab") %>%
  group_by(sample_ID) %>%
  summarise(all_na = all(is.na(individuals))) %>%  # find all NA totals counted in the lab
  filter(all_na) %>%  # only keep NA totals
  pull(sample_ID)

# find experiments where both all 120 counts = 0 and all lab counts = NA
tmp_experiments_0_na <- intersect(tmp_exp_zero_120, tmp_exp_na_lab)

# create new data frame with modified values for lab counts
corr_exp_data <- exp_data %>%
  mutate(individuals = if_else(
    count_time == "Lab" & sample_ID %in% tmp_experiments_0_na & is.na(individuals),
    0,  # replace NA with 0 for lab samples from exp in the list above with all NA values
    individuals))  # otherwise, keep the original value

I can now recalculate the total number of ants per experiment.

exp_totals <- corr_exp_data %>%
  filter(count_time %in% c("120", "Frozen", "Lab")) %>%  # Keep only relevant time points
  group_by(sample_ID, count_time) %>%  # Group by experiment and time point
  summarise(
    total_ants = if_else(all(is.na(individuals)), NA, sum(individuals, na.rm = TRUE)),  # if all values are NA, keep NA, if at least 1 value is real, sum
    .groups = "drop") %>%  
  pivot_wider(names_from = count_time, values_from = total_ants, names_prefix = "total_")  # Convert to wide format

head(exp_totals)
## # A tibble: 6 × 4
##   sample_ID total_120 total_Frozen total_Lab
##   <chr>         <dbl>        <dbl>     <dbl>
## 1 22-10000          0            0         0
## 2 22-11002         31           40        40
## 3 22-11003         12           NA        NA
## 4 22-11004          3           NA         2
## 5 22-11005          0           NA         0
## 6 22-11006          6           NA         0

I can also obtain the total of ants from each experiment from the species data, which should correspond to the lab count totals. The species data only has rows for experiments which had ants, so the experiments where 0 ants were found need to be added.

The totals per concentration for each experiment can also be extracted from the species data and the lab count data (to add the values for the experiments with 0 ants).

# remove non-ants from the species data
corr_species_data <- species_data[species_data$family != "non-ant",]

## totals per experiment

# extract the total number of ants of all species per experiment
tmp_sp_totals <- corr_species_data %>%
  group_by(sample_ID) %>%  # Group by experiment
  summarise(total_ants_sp = sum(individuals),
    .groups = "drop")

nrow(tmp_sp_totals); nrow(env_data) # sp_totals has less rows because many exp had no ants
## [1] 142
## [1] 239
# intermediary check that the total of ants is the same as in the initial data frame
sum(tmp_sp_totals$total_ants_sp, na.rm = T) == sum(corr_species_data$individuals, na.rm = T)
## [1] TRUE
# Create a complete data frame with all sample_IDs
sp_totals <- expand.grid(sample_ID = unique(env_data$sample_ID))  # take sample ID column from env_data to have the full 239 experiments

# Merge complete data frame with ant totals
sp_totals <- left_join(sp_totals, tmp_sp_totals, by = c("sample_ID")) 

# from exp_totals, filter experiments where the lab total count was 0 or NA
tmp_exp_totals_0_na <- exp_totals %>%
  filter(total_Lab == 0 | is.na(total_Lab)) %>%
  dplyr::select(sample_ID, total_Lab) 

# update ant counts in species totals with exp where lab count is 0 or NA
sp_totals <- sp_totals %>%
  left_join(tmp_exp_totals_0_na, by = c("sample_ID")) %>%
  mutate(total_ants_sp = ifelse(!is.na(total_Lab) & total_Lab == 0 & is.na(total_ants_sp), 0, total_ants_sp)) %>%
  dplyr::select(sample_ID, total_ants_sp)

# check sum of individuals is the same as in corr_species_data & we have the same sample IDs as in env_data
sum(sp_totals$total_ants_sp, na.rm = T) == sum(corr_species_data$individuals, na.rm = T) 
## [1] TRUE
all(unique(sp_totals$sample_ID) == env_data$sample_ID)
## [1] TRUE
# compare number of rows with total = NA or total = 0
nrow(sp_totals[is.na(sp_totals$total_ants_sp),]); nrow(exp_totals[is.na(exp_totals$total_Lab),])
## [1] 30
## [1] 29
nrow(na.omit(sp_totals[sp_totals$total_ants_sp == 0,])) == nrow(exp_totals[exp_totals$total_Lab == 0 & !is.na(exp_totals$total_Lab),])
## [1] TRUE
# 1 more row with NA in sp_totals than exp totals: check if samples not in the species dataset have ants
na.omit(exp_totals[exp_totals$sample_ID %in% setdiff(env_data$sample_ID, tmp_sp_totals$sample_ID) & exp_totals$total_Lab > 0, c(1,4)]) 
## # A tibble: 1 × 2
##   sample_ID total_Lab
##   <chr>         <dbl>
## 1 22-11009          2
# exp 22-11009 has ants counted in the lab which were not identified and will show as NA in sp_totals


## totals per bait

# extract the total number of ants of all species per experiment and bait
tmp_sp_totals_bait <- corr_species_data %>%
  group_by(sample_ID, concentration) %>%  # Group by experiment ant concentration
  summarise(total_ants_sp_bait = sum(individuals),
    .groups = "drop")

# intermediary check that the total of ants is the same as in the initial data frame
sum(tmp_sp_totals_bait$total_ants_sp_bait, na.rm = T) == sum(corr_species_data$individuals, na.rm = T)
## [1] TRUE
# Create a complete data frame with all combinations of sample ID and concentration
sp_totals_bait <- expand.grid(
  sample_ID = unique(env_data$sample_ID), # take sample ID column from env_data to have the full 239 experiments
  concentration = unique(corr_exp_data$concentration))  # all concentrations from 0 to 40
 
# Merge complete data frame with ant bait totals
sp_totals_bait <- left_join(sp_totals_bait, tmp_sp_totals_bait, by = c("sample_ID", "concentration")) %>%
  arrange(sample_ID, concentration)

# from corr_exp_data, filter experiments where the lab total count was 0 or NA
tmp_exp_0_na <- corr_exp_data %>%
  filter(count_time == "Lab" & (individuals == 0 | is.na(individuals))) %>%
  dplyr::select(sample_ID, concentration, individuals) %>%
  rename(lab_count = individuals)

# update ant counts in bait totals with exp where lab count is 0 or NA
sp_totals_bait <- sp_totals_bait %>%
  left_join(tmp_exp_0_na, by = c("sample_ID", "concentration")) %>%
  mutate(total_ants_sp_bait = ifelse(!is.na(lab_count) & lab_count == 0 & is.na(total_ants_sp_bait), 0, total_ants_sp_bait)) %>%
  dplyr::select(sample_ID, concentration, total_ants_sp_bait) %>%
  arrange(sample_ID, concentration)

# check sum of individuals is the same as in corr_species_data & we have the same sample IDs as in env_data
sum(sp_totals_bait$total_ants_sp_bait, na.rm = T) == sum(corr_species_data$individuals, na.rm = T) 
## [1] TRUE
all(unique(sp_totals_bait$sample_ID) == env_data$sample_ID)
## [1] TRUE
# compare number of rows with total = NA or total = 0
nrow(sp_totals_bait[is.na(sp_totals_bait$total_ants_sp_bait),]); nrow(tmp_exp_0_na[is.na(tmp_exp_0_na$lab_count),])
## [1] 170
## [1] 169
nrow(na.omit(sp_totals_bait[sp_totals_bait$total_ants_sp_bait == 0,])) == nrow(tmp_exp_0_na[tmp_exp_0_na$lab_count == 0 & !is.na(tmp_exp_0_na$lab_count),])
## [1] TRUE
# 1 more row with NA in sp_totals_bait than exp totals: check if samples not in the species dataset have ants
na.omit(corr_exp_data[corr_exp_data$sample_ID %in%  setdiff(env_data$sample_ID, tmp_sp_totals_bait$sample_ID) & corr_exp_data$count_time == "Lab" & corr_exp_data$individuals > 0, c(1:2,4)]) 
## # A tibble: 1 × 3
##   sample_ID concentration individuals
##   <chr>             <dbl>       <dbl>
## 1 22-11009             10           2
# also exp 22-11009 has ants counted in the lab which were not identified and will show as NA in sp_totals

Experiment 22-11009 had 2 ants counted in the lab but no ants in the species data, meaning the species was not identified because the sample was not found. As it stands, this row is reported as NA in the species totals.

I compare the species data with the experiment data to see whether the total number of ants for each experiment matches.

## comparing ant totals per experiment

# add the total_lab column calculated previously
tmp_totals_comparison <- cbind(sp_totals, exp_totals$total_Lab)

# extract experiments where the total from the lab counting and the total from the species data does not match
exp_problem_totals <- tmp_totals_comparison[tmp_totals_comparison[,2] != tmp_totals_comparison[,3],]
exp_problem_totals <- exp_problem_totals[!is.na(exp_problem_totals$sample_ID),] # remove NA rows

# compare total ants from both sources
sum(tmp_totals_comparison[,3], na.rm = T); sum(tmp_totals_comparison$total_ants_sp, na.rm = T)
## [1] 5468
## [1] 5457
# percentage of difference between the 2 totals
1-(sum(tmp_totals_comparison$total_ants_sp, na.rm = T)/sum(tmp_totals_comparison[,3], na.rm = T))
## [1] 0.002011704
## comparing totals at each bait

# from corr_exp_data, filter lab totals at each concentration
tmp_exp_lab_counts <- corr_exp_data %>%
  filter(count_time == "Lab") %>%
  dplyr::select(sample_ID, concentration, individuals) %>%  # keep only relevant columns
  rename(lab_count = individuals)  # rename the individuals column

tmp_totals_bait_comparison <- cbind(sp_totals_bait, tmp_exp_lab_counts$lab_count)

# compare total ants from both sources
sum(tmp_totals_bait_comparison$total_ants_sp_bait, na.rm = T); sum(tmp_totals_bait_comparison[,4], na.rm = T)
## [1] 5457
## [1] 5468
# extract experiments where the total from the lab counting and the total from the species data does not match
baits_problem_totals <- tmp_totals_bait_comparison[tmp_totals_bait_comparison[,4] != tmp_totals_bait_comparison[,3],]
baits_problem_totals <- baits_problem_totals[!is.na(baits_problem_totals$sample_ID),] # remove NA rows

There are some minor discrepancies (less than 1% of the total number of ants) between the lab total and the species data total. Since a higher proportion of the species data was counted using a click counter, I consider the species data counts to be more reliable and use these as the variable for analysis over the lab totals when working with the number of ants collected at the end of the experiment.

In order to keep my global environment as clean as possible and without too many variables with similar names at once, I periodically remove all variables labeled as temporary (name starts with prefix “tmp_”) from the global environment.

rm(list = ls(pattern = "^tmp_"))
0.2.2 Scaling the number of ants collected

The number of ants collected at the end of each experiment does not account for absolute differences in abundance due to ant colony size or foraging strategy (e.g., solitary foraging vs mass recruiting). Both are important features of ant foraging and vary between species (among other factors).

In order to account for this as best as possible, I create scaled measures of ant activity which account for these absolute differences in abundances and which are appropriate for each research question.

  • Research question 1: I am looking at whether ant foraging activity is influenced by environmental conditions. For this question, I consider the total amount of ants present at the end of each experiment (all baits pooled together). In order to account for species differences in colony size and foraging strategy, I create a scaled variable to measure species-level foraging activity.
    • For each experiment, I calculate the number of ants of species i collected at the end of experiment j and offset it by the maximum number of ants of species i found on any experiment.
    • This gives me a rate for foraging activity per species for each experiment, and I can test whether this activity changes under different environmental conditions.
    • In terms of data, this means I have 1 row per species per experiment.
# sum all ants of the same species of each experiment together
tmp_species_per_exp <- corr_species_data %>%
  group_by(sample_ID, sp_abbr) %>%   # for now we group by abbreviated species name
  dplyr::summarize(individuals = sum(individuals, na.rm = T), .groups = "drop")

# List of all unique species
tmp_all_species <- unique(corr_species_data$sp_abbr)

# Create an empty data frame with 1 species per exp
species_per_exp <- expand.grid(
  sample_ID = unique(env_data$sample_ID), # take sample ID column from env_data to have all 239 experiments
  sp_abbr = tmp_all_species) %>%
  arrange(sample_ID)

# Merge this complete data frame with species counts
species_per_exp <- left_join(species_per_exp, tmp_species_per_exp, 
                                   by = c("sample_ID", "sp_abbr")) 

# update species table with rows where no ants were found
species_per_exp <- species_per_exp %>%
  left_join(sp_totals, by = c("sample_ID")) %>%  # join species totals calculated in chunk 0.2.1.4
  mutate(individuals = case_when(
    is.na(individuals) & total_ants_sp == 0 ~ 0,  # True zero: No ants on this bait
    is.na(individuals) & !is.na(total_ants_sp) ~ 0,  # Species not found → Absence is zero
    is.na(individuals) & is.na(total_ants_sp) ~ NA,  # Preserve true missing values
    .default = individuals)) %>%            # Keep original values  )) %>%
  dplyr::select(-total_ants_sp)             # Drop extra column

# check sum of individuals is the same as in corr_species_data & all sample IDs from env_data are there
sum(species_per_exp$individuals, na.rm = T) == sum(corr_species_data$individuals, na.rm = T) 
## [1] TRUE
all(unique(species_per_exp$sample_ID) == env_data$sample_ID)
## [1] TRUE
# compare number of rows where total = NA with the sp_totals (nrow with NAs/number of species)
nrow(species_per_exp[is.na(species_per_exp$individuals),])/length(tmp_all_species) == nrow(sp_totals[is.na(sp_totals$total_ants_sp),])
## [1] TRUE
all(unique(species_per_exp$sample_ID[is.na(species_per_exp$individuals)]) == sp_totals$sample_ID[is.na(sp_totals$total_ants_sp)])
## [1] TRUE
# extract maximum number of ants of each species found on any experiment
scaled_species_exp <- species_per_exp %>%
  group_by(sp_abbr) %>%
  mutate(species_max = max(individuals, na.rm = T)) %>% # add new column with max individuals of that genus across baits
  ungroup()  # remove grouping by species

# add column with scaled ant total per species
scaled_species_exp <- scaled_species_exp %>%
  mutate(scaled_species_total = (individuals/species_max))
  • Research question 3: I am testing whether ants can differentiate between 5 baits of different sugar concentrations in one experiment and whether they forage more intensively on higher sugar concentrations. For this question, I look at the amount of ants collected on each bait at the end of one experiment. To account for species differences within each experiment, I create a scaled variable to measure species-level activity within each experiment.
    • For each experiment, I calculate the number of ants of species i on concentration j / the total number of ants of species i on that experiments (sum of all concentrations).
    • In other words, I look at the proportion of ants of each species on each bait of one experiment.
    • In terms of data, this means I have 1 row per species per concentration per experiment.
    • I can also calculate overall ant activity, without looking at species differences, by simply calculating the proportion of ants on each bait of one experiment, i.e. the number of ants on bait j / the total number of ants on the experiment (sum of all baits).
# sum all ants of the same species on each bait of the same experiment together
tmp_species_per_bait <- corr_species_data %>%
  group_by(sample_ID, concentration, sp_abbr) %>%   # for now we group by abbreviated species name
  dplyr::summarize(individuals = sum(individuals, na.rm = T), .groups = "drop")

# List of all unique species
tmp_all_species <- unique(corr_species_data$sp_abbr)

# Create an empty data frame with 1 species per concentration per exp.
species_per_bait <- expand.grid(
  sample_ID = unique(env_data$sample_ID), # take sample ID column from env_data to have all 239 experiments
  concentration = unique(corr_exp_data$concentration),  # take concentration column from exp data to have all 5 []
  sp_abbr = tmp_all_species) 

# Merge this complete data frame with species counts
species_per_bait <- left_join(species_per_bait, tmp_species_per_bait, 
                                   by = c("sample_ID", "concentration", "sp_abbr")) %>%
  arrange(sample_ID, concentration)

# update species table with rows where no ants were found
species_per_bait <- species_per_bait %>%
  left_join(sp_totals_bait, by = c("sample_ID", "concentration")) %>%  # join species totals per bait calculated in chunk 0.2.4
   mutate(individuals = case_when(
    is.na(individuals) & total_ants_sp_bait == 0 ~ 0,  # True zero: No ants on this bait
    is.na(individuals) & !is.na(total_ants_sp_bait) ~ 0,  # Species not found → Absence is zero
    is.na(individuals) & is.na(total_ants_sp_bait) ~ NA,  # Preserve true missing values
    .default = individuals)) %>%            # Keep original values  )) %>%
  dplyr::select(-total_ants_sp_bait)             # Drop extra column

# check sum of individuals is the same as in corr_species_data & all sample IDs from env_data are there
sum(species_per_bait$individuals, na.rm = T) == sum(corr_species_data$individuals, na.rm = T) 
## [1] TRUE
all(unique(species_per_bait$sample_ID) == env_data$sample_ID)
## [1] TRUE
# compare number of rows where total = NA with the sp_totals (nrow with NAs/number of species)
nrow(species_per_bait[is.na(species_per_bait$individuals),])/length(tmp_all_species) == nrow(sp_totals_bait[is.na(sp_totals_bait$total_ants_sp_bait),])
## [1] TRUE
all(unique(species_per_bait$sample_ID[is.na(species_per_bait$individuals)]) == unique(sp_totals_bait$sample_ID[is.na(sp_totals_bait$total_ants_sp_bait)]))
## [1] TRUE
# calculate species total per bait
tmp_sp_bait_total <- species_per_bait %>%
  group_by(sample_ID, concentration) %>%
  dplyr::summarize(individuals = sum(individuals), .groups = "drop")

# control that species_per_bait has the same totals as species totals per bait
which(tmp_sp_bait_total$individuals != sp_totals_bait$total_ants_sp_bait)
## integer(0)
# extract total number of ants of each species found on any experiment
scaled_species_bait <- species_per_bait %>%
  group_by(sample_ID, sp_abbr) %>%
  mutate(species_total_exp = ifelse(all(is.na(individuals)), NA, sum(individuals, na.rm = TRUE)), # add new column with total individuals of that species per experiment, keep all values as NA if the entire experiment has missing values
         scaled_species_total = case_when(
           !is.na(individuals) & species_total_exp == 0 ~ 0,  # avoid divisions by 0
           is.na(individuals) ~ NA,                           # keep NA values
           .default = individuals / species_total_exp)) %>%    # scaled value
  ungroup()  # remove grouping by species
scaled_totals_bait <- sp_totals_bait %>%
  group_by(sample_ID) %>%
  mutate(ant_total_exp = ifelse(all(is.na(total_ants_sp_bait)), NA, sum(total_ants_sp_bait, na.rm = TRUE)), # add new column with total individuals per experiment, keep all values as NA if the entire experiment has missing values
         scaled_ant_total = case_when(
           !is.na(total_ants_sp_bait) & ant_total_exp == 0 ~ 0,  # avoid divisions by 0
           is.na(total_ants_sp_bait) ~ NA,                       # keep NA values
           .default = total_ants_sp_bait / ant_total_exp)) %>%   # scaled value
  ungroup()  # remove grouping by species
  • Research question 4: this research question is different from the previous in that here, I am looking at what happens during the baiting experiment. As a follow-up to question 3, if ants can differentiate between different sugar concentrations, I want to test how fast this differentiation happens. If there is a preference for higher sugar concentrations, does it establish immediately or over time ? For this question, I look at all ants present on one experiment at one counting time point and see what proportion of these ants is found on each bait. Since this counted during the experiment, there is no species information so I cannot use species as a scaling factor.
    • Instead, for each experiment, I calculate the number of ants on bait j at time point h / the total number of ants on the experiment (sum of all baits) at time point h.
    • This gives me the proportion of ants on each sugar concentration bait at time point h.
    • In terms of data, I have 1 row per time point (5, 10, 20, 40, 80, 120 minutes) per concentration (0, 5, 10, 20, 40%) per experiment.
# select relevant columns and count times
scaled_timecount <- corr_exp_data %>%
  select(sample_ID, concentration, count_time, individuals) %>% # keep only relevant columns
  filter(count_time %in% c(5, 10, 20, 40, 80, 120))  # remove frozen and lab counts

# calculate total ants per time point + scaled column
scaled_timecount <- scaled_timecount %>%
  group_by(sample_ID, count_time) %>%
  mutate(total_ants_time = sum(individuals, na.rm = T)) %>%  # count total of ants at each time point
  ungroup() %>%
  mutate(scaled_ants_time = case_when(                      # add scaled column
           !is.na(individuals) & total_ants_time == 0 ~ 0,  # avoid divisions by 0
           is.na(individuals) ~ NA,                           # keep NA values
           .default = individuals / total_ants_time))    # scaled value
rm(list = ls(pattern = "^tmp_"))

0.3 Preparation of environmental data

# create a data frame with only relevant columns for analysis
cut_env_data <- cbind(env_data[,1:2], env_data[,50], env_data[,3:7], 
                      env_data[,9], env_data[,12], env_data[,14:47])
0.3.1 Time and date data

Both time and date data need to be converted into numerical values in order to run the statistical analysis.

For the time data, I filter the env_data Google Sheet and see that the earliest experiment start time is 08:10, and the latest is 18:36. I can turn the time of day data into numerical data by converting the time data into minutes since 8 AM.

# adding a new column to the table with minutes since 8 am
cut_env_data <- cut_env_data %>%
  mutate(minutes_since_8am = (hour(start_time) * 60 + minute(start_time)) - (8 * 60)) %>%
  relocate(minutes_since_8am, .after = end_time)  #  re-order columns

# View transformed time column
head(cut_env_data[, c("start_time", "minutes_since_8am")])
##   start_time minutes_since_8am
## 1   09:30:00                90
## 2   09:56:00               116
## 3   10:58:00               178
## 4   09:50:00               110
## 5   10:20:00               140
## 6   10:35:00               155

For the date data, I can convert it into numerical values by finding the earliest experiment day and considering it as day 1 and assigning numbers to all subsequent days relative to the first experiment day. I am using this method instead of considering January 1st as day 1 because 2024, unlike 2023 and 2022, was a leap year so it has 1 more day in February.

# Create a new column that maps each date to a fixed year (e.g., 1999)
# This ensures that only the month and day are considered.
cut_env_data <- cut_env_data %>%
  mutate(date_day_month = as.Date(paste0("1999-", format(date, "%m-%d")))) %>%
  relocate(date_day_month, .before = start_time)  #  re-order columns

# Find the earliest day (month and day) across all experiments in the fixed-year space
tmp_first_experiment_fixed <- min(cut_env_data$date_day_month, na.rm = T)

# Create a new column with the relative day (with the earliest experiment as day 1)
# Create a year column
cut_env_data <- cut_env_data %>%
  mutate(relative_day = as.numeric(difftime(date_day_month, tmp_first_experiment_fixed, units = "days")) + 1,
         year = year(date)) %>%
  relocate(relative_day, .before = start_time) %>%
  relocate(year, .after = date_time)  #  re-order columns

# Check the results
head(cut_env_data[, c("date", "date_day_month", "relative_day")])
##         date date_day_month relative_day
## 1 2022-05-13     1999-05-13           32
## 2 2022-05-20     1999-05-20           39
## 3 2022-05-23     1999-05-23           42
## 4 2022-05-20     1999-05-20           39
## 5 2022-05-25     1999-05-25           44
## 6 2022-05-20     1999-05-20           39
rm(list = ls(pattern = "^tmp_"))
0.3.2 Weather conditions data

In the Ant Picnic data sheets, the information regarding the presence or absence of rain during the experiment and the level of cloud cover were merged into a “weather” category, combining the 4 possibilities of “no clouds”, “lightly cloudy”, “very cloudy” and “rain”. For my analysis, I need to dissociate the presence/absence of rain (a binary variable) from the cloud cover level.

All experiments with rain were also very cloudy. Therefore, I can add a “rain” column and replace the “rain, very cloudy” entries with very cloudy in the newly renamed “clouds” column.

# view the levels of the weather column
levels(as.factor(cut_env_data$weather)) # all rainy exp were also very cloudy
## [1] "lightly cloudy"    "no clouds"         "rain, very cloudy"
## [4] "very cloudy"
# create new rain column and rename weather column
cut_env_data <- cut_env_data %>%
  mutate(rain = if_else(weather == "rain, very cloudy", "rain", "no rain"), 
         weather = if_else(weather == "rain, very cloudy", "very cloudy", weather)) %>%
  rename(clouds = weather)  %>%  # renaming the weather column
  relocate(rain, .after = wind)  #  re-order columns

# check known experiments with rain
print(cut_env_data[cut_env_data$sample_ID %in% c("23-22001", "23-22003","23-22007", "24-11083", "24-11084", "24-11085", "24-11086"), c("clouds", "rain")])
##          clouds rain
## 88  very cloudy rain
## 90  very cloudy rain
## 92  very cloudy rain
## 212 very cloudy rain
## 213 very cloudy rain
## 214 very cloudy rain
## 215 very cloudy rain
# check that number of experiments with rain = 7
nrow(na.omit(cut_env_data[cut_env_data$rain == "rain", c("sample_ID", "clouds", "rain")]))
## [1] 7

I also need to encode my categorical variables as numbers for later analysis. For rain, it is coded as 0 = no rain and 1 = rain. For clouds, wind, and shade, it is coded as 1 = none, 2 = intermediate (light wind/clouds, half-shade), 3 = very (or full for shade).

# check the levels for all categorical variables
levels(as.factor(cut_env_data$clouds)) 
## [1] "lightly cloudy" "no clouds"      "very cloudy"
levels(as.factor(cut_env_data$wind)) 
## [1] "no wind"     "strong wind" "weak wind"
levels(as.factor(cut_env_data$shade))
## [1] "full shade" "half-shade" "no shade"
levels(as.factor(cut_env_data$rain)) 
## [1] "no rain" "rain"
# recode the categorical columns
cut_env_data <- cut_env_data %>%
  mutate(clouds_numeric = dplyr::recode(clouds, 
                                 "no clouds" = 1, 
                                 "lightly cloudy" = 2, 
                                 "very cloudy" = 3),
         wind_numeric = dplyr::recode(wind,
                               "no wind" = 1,
                               "weak wind" = 2,
                               "strong wind" = 3),
         shade_numeric = dplyr::recode(shade, 
                                "no shade" = 1,
                                "half-shade" = 2,
                                "full shade" = 3),
         rain_numeric = dplyr::recode(rain,
                               "no rain" = 0,
                               "rain" = 1)) %>%  
  relocate(c(clouds_numeric, wind_numeric, shade_numeric, rain_numeric), .after = rain)  #  re-order columns

0.4 Creating data frames for analysis

To answer my research questions 1 (see below), I need a table which combines the scaled number of ants per species per experiment and the environmental data. I also ensure the variable formats are correct.

# add env data to scaled species per exp
env_analysis <- scaled_species_exp %>%
  left_join(cut_env_data, by = c("sample_ID")) 

# making sure all columns have the right formats
env_analysis <- env_analysis %>%
  mutate(across(c("sample_ID", "sp_abbr", "climate_logger_ID", "city", "site"), as.factor),
         clouds = factor(clouds, levels = c("no clouds", "lightly cloudy", "very cloudy")),  # specify order of factor levels
         shade = factor(shade, levels = c("no shade", "half-shade", "full shade")),
         wind = factor(wind, levels = c("no wind", "weak wind", "strong wind")),
         rain = factor(rain, levels = c("no rain", "rain")),
         across(c("individuals", "species_max", "scaled_species_total", "year", 
                  "relative_day", "minutes_since_8am", "latitude", "longitude", 
                  "temperature"), as.numeric),
         across(c(24:57), as.numeric)) # set all imp and climate logger variables to numeric

str(env_analysis)
## tibble [6,453 × 57] (S3: tbl_df/tbl/data.frame)
##  $ sample_ID           : Factor w/ 239 levels "22-10000","22-11002",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ sp_abbr             : Factor w/ 27 levels "D. quadripunctatus",..: 12 13 21 6 22 27 3 15 26 16 ...
##  $ individuals         : num [1:6453] 0 0 0 0 0 0 0 0 0 0 ...
##  $ species_max         : num [1:6453] 306 2 6 11 47 502 2 15 27 18 ...
##  $ scaled_species_total: num [1:6453] 0 0 0 0 0 0 0 0 0 0 ...
##  $ date                : Date[1:6453], format: "2022-05-13" "2022-05-13" ...
##  $ date_time           : POSIXct[1:6453], format: "2022-05-13 09:30:00" "2022-05-13 09:30:00" ...
##  $ year                : num [1:6453] 2022 2022 2022 2022 2022 ...
##  $ date_day_month      : Date[1:6453], format: "1999-05-13" "1999-05-13" ...
##  $ relative_day        : num [1:6453] 32 32 32 32 32 32 32 32 32 32 ...
##  $ start_time          : 'hms' num [1:6453] 09:30:00 09:30:00 09:30:00 09:30:00 ...
##   ..- attr(*, "units")= chr "secs"
##  $ end_time            : 'hms' num [1:6453] 11:26:00 11:26:00 11:26:00 11:26:00 ...
##   ..- attr(*, "units")= chr "secs"
##  $ minutes_since_8am   : num [1:6453] 90 90 90 90 90 90 90 90 90 90 ...
##  $ climate_logger_ID   : Factor w/ 101 levels "94226711","94226712",..: 75 75 75 75 75 75 75 75 75 75 ...
##  $ latitude            : num [1:6453] 51.5 51.5 51.5 51.5 51.5 ...
##  $ longitude           : num [1:6453] 12 12 12 12 12 ...
##  $ city                : Factor w/ 3 levels "Berlin","Halle",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ site                : Factor w/ 39 levels "Anna-Essinger-Gemeinschaftsschule",..: 10 10 10 10 10 10 10 10 10 10 ...
##  $ temperature         : num [1:6453] 19 19 19 19 19 19 19 19 19 19 ...
##  $ clouds              : Factor w/ 3 levels "no clouds","lightly cloudy",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ shade               : Factor w/ 3 levels "no shade","half-shade",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ wind                : Factor w/ 3 levels "no wind","weak wind",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ rain                : Factor w/ 2 levels "no rain","rain": 1 1 1 1 1 1 1 1 1 1 ...
##  $ clouds_numeric      : num [1:6453] 2 2 2 2 2 2 2 2 2 2 ...
##  $ wind_numeric        : num [1:6453] 3 3 3 3 3 3 3 3 3 3 ...
##  $ shade_numeric       : num [1:6453] 2 2 2 2 2 2 2 2 2 2 ...
##  $ rain_numeric        : num [1:6453] 0 0 0 0 0 0 0 0 0 0 ...
##  $ imp_mean            : num [1:6453] 37.4 37.4 37.4 37.4 37.4 ...
##  $ imp_median          : num [1:6453] 22 22 22 22 22 22 22 22 22 22 ...
##  $ imp_stdev           : num [1:6453] 40.5 40.5 40.5 40.5 40.5 ...
##  $ imp_min             : num [1:6453] 0 0 0 0 0 0 0 0 0 0 ...
##  $ imp_max             : num [1:6453] 100 100 100 100 100 100 100 100 100 100 ...
##  $ t1_mean             : num [1:6453] 18.4 18.4 18.4 18.4 18.4 ...
##  $ t1_max              : num [1:6453] 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 ...
##  $ t1_min              : num [1:6453] 18.2 18.2 18.2 18.2 18.2 ...
##  $ t1_sd               : num [1:6453] 0.107 0.107 0.107 0.107 0.107 ...
##  $ t1_var              : num [1:6453] 0.0113 0.0113 0.0113 0.0113 0.0113 ...
##  $ t2_mean             : num [1:6453] 18.9 18.9 18.9 18.9 18.9 ...
##  $ t2_max              : num [1:6453] 19.4 19.4 19.4 19.4 19.4 ...
##  $ t2_min              : num [1:6453] 18.4 18.4 18.4 18.4 18.4 ...
##  $ t2_sd               : num [1:6453] 0.387 0.387 0.387 0.387 0.387 ...
##  $ t2_var              : num [1:6453] 0.15 0.15 0.15 0.15 0.15 ...
##  $ t3_mean             : num [1:6453] 18.2 18.2 18.2 18.2 18.2 ...
##  $ t3_max              : num [1:6453] 18.8 18.8 18.8 18.8 18.8 ...
##  $ t3_min              : num [1:6453] 17.7 17.7 17.7 17.7 17.7 ...
##  $ t3_sd               : num [1:6453] 0.38 0.38 0.38 0.38 0.38 ...
##  $ t3_var              : num [1:6453] 0.145 0.145 0.145 0.145 0.145 ...
##  $ raw_moist_mean      : num [1:6453] 741 741 741 741 741 ...
##  $ raw_moist_max       : num [1:6453] 748 748 748 748 748 748 748 748 748 748 ...
##  $ raw_moist_min       : num [1:6453] 733 733 733 733 733 733 733 733 733 733 ...
##  $ raw_moist_sd        : num [1:6453] 5.06 5.06 5.06 5.06 5.06 ...
##  $ raw_moist_var       : num [1:6453] 25.6 25.6 25.6 25.6 25.6 ...
##  $ vol_moist_mean      : num [1:6453] 1.97 1.97 1.97 1.97 1.97 ...
##  $ vol_moist_max       : num [1:6453] 2.1 2.1 2.1 2.1 2.1 2.1 2.1 2.1 2.1 2.1 ...
##  $ vol_moist_min       : num [1:6453] 1.8 1.8 1.8 1.8 1.8 1.8 1.8 1.8 1.8 1.8 ...
##  $ vol_moist_sd        : num [1:6453] 0.111 0.111 0.111 0.111 0.111 ...
##  $ vol_moist_var       : num [1:6453] 0.0124 0.0124 0.0124 0.0124 0.0124 ...

To answer my research question 2 (see below), I need a table which combines the scaled number of ants per species per bait per experiment and the “site” variables which will be modeled as a random effect. I also add a “presence” column which records whether there was > 0 ants present on the baits and ensure the variable formats are correct.

# adding a site column for the random effect + a presence column
sugar_analysis <- scaled_species_bait %>%
  left_join(env_data[,c("sample_ID", "site")], by = c("sample_ID")) %>%  # add site column
  mutate(presence = ifelse(individuals > 0, 1, 0)) %>%  # add presence/absence column
  relocate(presence, .before = site)

# making sure all columns have the right formats
sugar_analysis <- sugar_analysis %>%
  mutate(across(c("sample_ID", "sp_abbr", "site"), as.factor),
         across(c("individuals", "concentration", "species_total_exp", "scaled_species_total", 
                  "presence"), as.numeric))

str(sugar_analysis)
## tibble [32,265 × 8] (S3: tbl_df/tbl/data.frame)
##  $ sample_ID           : Factor w/ 239 levels "22-10000","22-11002",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ concentration       : num [1:32265] 0 0 0 0 0 0 0 0 0 0 ...
##  $ sp_abbr             : Factor w/ 27 levels "D. quadripunctatus",..: 12 13 21 6 22 27 3 15 26 16 ...
##  $ individuals         : num [1:32265] 0 0 0 0 0 0 0 0 0 0 ...
##  $ species_total_exp   : num [1:32265] 0 0 0 0 0 0 0 0 0 0 ...
##  $ scaled_species_total: num [1:32265] 0 0 0 0 0 0 0 0 0 0 ...
##  $ presence            : num [1:32265] 0 0 0 0 0 0 0 0 0 0 ...
##  $ site                : Factor w/ 39 levels "Anna-Essinger-Gemeinschaftsschule",..: 10 10 10 10 10 10 10 10 10 10 ...
# data frame for overall activity (not species-level)
sugar_analysis_overall <- scaled_totals_bait %>%
  left_join(env_data[,c("sample_ID", "site")], by = c("sample_ID")) %>%
  rename(individuals = total_ants_sp_bait) %>% 
  mutate(presence = ifelse(individuals > 0, 1, 0)) %>%  # add presence/absence column
  relocate(presence, .before = site)

# making sure all columns have the right formats
sugar_analysis_overall <- sugar_analysis_overall %>%
  mutate(across(c("sample_ID", "site"), as.factor),
         across(c("individuals", "concentration", "ant_total_exp", "scaled_ant_total", 
                  "presence"), as.numeric))

str(sugar_analysis_overall)
## tibble [1,195 × 7] (S3: tbl_df/tbl/data.frame)
##  $ sample_ID       : Factor w/ 239 levels "22-10000","22-11002",..: 1 1 1 1 1 2 2 2 2 2 ...
##  $ concentration   : num [1:1195] 0 5 10 20 40 0 5 10 20 40 ...
##  $ individuals     : num [1:1195] 0 0 0 0 0 0 1 2 0 37 ...
##  $ ant_total_exp   : num [1:1195] 0 0 0 0 0 40 40 40 40 40 ...
##  $ scaled_ant_total: num [1:1195] 0 0 0 0 0 0 0.025 0.05 0 0.925 ...
##  $ presence        : num [1:1195] 0 0 0 0 0 0 1 1 0 1 ...
##  $ site            : Factor w/ 39 levels "Anna-Essinger-Gemeinschaftsschule",..: 10 10 10 10 10 37 37 37 37 37 ...

For research question 3, I need a table which combines the scaled number of ants per counting time point per experiment and the “site” variables which will be modeled as a random effect. I also add a “presence” column which records whether there was > 0 ants present on the baits and ensure the variable formats are correct.

# add site and presence column to scaled_timecount
speed_analysis <- scaled_timecount %>%
  left_join(env_data[,c("sample_ID", "site")], by = c("sample_ID")) %>%  # add site column  
  mutate(presence = ifelse(individuals > 0, 1, 0)) %>%  # add presence/absence column
  relocate(presence, .before = site)

# making sure all columns have the right formats
speed_analysis <- speed_analysis %>%
  mutate(across(c("sample_ID", "concentration", "site"), as.factor),
         across(c("count_time", "individuals", "total_ants_time", "scaled_ants_time", 
                  "presence"), as.numeric))

str(speed_analysis)
## tibble [7,170 × 8] (S3: tbl_df/tbl/data.frame)
##  $ sample_ID       : Factor w/ 239 levels "22-10000","22-11002",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ concentration   : Factor w/ 5 levels "0","5","10","20",..: 1 1 1 1 1 1 2 2 2 2 ...
##  $ count_time      : num [1:7170] 5 10 20 40 80 120 5 10 20 40 ...
##  $ individuals     : num [1:7170] 0 0 0 0 1 0 0 0 0 0 ...
##  $ total_ants_time : num [1:7170] 0 0 0 1 2 0 0 0 0 1 ...
##  $ scaled_ants_time: num [1:7170] 0 0 0 0 0.5 0 0 0 0 0 ...
##  $ presence        : num [1:7170] 0 0 0 0 1 0 0 0 0 0 ...
##  $ site            : Factor w/ 39 levels "Anna-Essinger-Gemeinschaftsschule",..: 10 10 10 10 10 10 10 10 10 10 ...

0.5 General data visualization

0.5.1 Validating citizen science data

Since the Ant Picnic experiments were conducted both by citizens and a scientist, it is interesting to compare the accuracy of some aspects of the data between citizen participants and scientists. Moreover, it is interesting to check how the participant-recorded measures such as temperature compare with the climate logger.

I start by comparing the microclimate logger temperatures (both ground temperature and aboveground temperature) with the participant-recorded temperatures (from a weather app).

# ground temperature vs weather app temperature, per year
(tmp_t2_temp_plot <- ggplot(na.omit(env_analysis), aes(x = temperature, y = t2_mean, color = as.factor(year)))+
   geom_point()+
   geom_smooth(method = lm, se = F)+
   geom_abline(linetype = 2, color = "red")+ # add 1:1 identity line in red and dashed
   scale_color_paletteer_d("nationalparkcolors::Acadia") +  # Apply Acadia discrete color scale
   labs(x = "Weather app temperature (\u00B0C)", 
        y = "Ground temperature (microclimate logger) (\u00B0C)", color = "Year") +
   theme_classic2() +
   theme(legend.text = element_text(size = 8),  # Smaller legend text
        legend.title = element_text(size = 12, hjust = 0.5, vjust = 2),  # Smaller legend title
        legend.key.size = unit(0.6, "cm"),  # Makes the legend squares smaller
        axis.title.x = element_text(margin = margin(t = 10)),  # adjust distance from axis title to axis
        axis.title.y = element_text(margin = margin(r = 10), hjust = 0.5)) # adjust distance from axis title to axis, center title
 )
## `geom_smooth()` using formula = 'y ~ x'

# above ground temperature vs weather app temperature, per year
(tmp_t3_temp_plot <- ggplot(na.omit(env_analysis), aes(x = temperature, y = t3_mean, color = as.factor(year)))+
   geom_point()+
   geom_smooth(method = lm, se = F)+
   geom_abline(linetype = 2, color = "red")+ # add 1:1 identity line in red and dashed
   scale_color_paletteer_d("nationalparkcolors::Acadia") +  # Apply Acadia discrete color scale
   labs(x = "Weather app temperature (\u00B0C)", 
        y = "Aboveground temperature (microclimate logger) (\u00B0C)", color = "Year") +
   theme_classic2() +
   theme(legend.text = element_text(size = 8),  # Smaller legend text
        legend.title = element_text(size = 12, hjust = 0.5, vjust = 2),  # Smaller legend title
        legend.key.size = unit(0.6, "cm"),  # Makes the legend squares smaller
        axis.title.x = element_text(margin = margin(t = 10)),  # adjust distance from axis title to axis
        axis.title.y = element_text(margin = margin(r = 10), hjust = 0.5)) # adjust distance from axis title to axis, center title
 )
## `geom_smooth()` using formula = 'y ~ x'

# ground temp vs weather app temperature, all years pooled
(tmp_t2_temp_all_plot <- ggplot(na.omit(env_analysis), aes(x = temperature, y = t2_mean))+
   geom_point(color = "black")+
   geom_smooth(method = lm, se = F, , color = "black")+
   geom_abline(linetype = 2, color = "red")+ # add 1:1 identity line in red and dashed
   labs(x = "Weather app temperature (\u00B0C)", 
        y = "Ground temperature (microclimate logger) (\u00B0C)") +
   theme_classic2() +
   theme(axis.title.x = element_text(margin = margin(t = 10)),  # adjust distance from axis title to axis
        axis.title.y = element_text(margin = margin(r = 10), hjust = 0.5)) # adjust distance from axis title to axis, center title
 )
## `geom_smooth()` using formula = 'y ~ x'

# aboveground temp vs weather app temperature, all years pooled
(tmp_t3_temp_all_plot <- ggplot(na.omit(env_analysis), aes(x = temperature, y = t3_mean))+
   geom_point(color = "black")+
   geom_smooth(method = lm, se = F, , color = "black")+
   geom_abline(linetype = 2, color = "red")+ # add 1:1 identity line in red and dashed
   labs(x = "Weather app temperature (\u00B0C)", 
        y = "Aboveground temperature (microclimate logger) (\u00B0C)") +
   theme_classic2() +
   theme(axis.title.x = element_text(margin = margin(t = 10)),  # adjust distance from axis title to axis
        axis.title.y = element_text(margin = margin(r = 10), hjust = 0.5)) # adjust distance from axis title to axis, center title
 )
## `geom_smooth()` using formula = 'y ~ x'

It appears that all temperature measures correlate fairly well, with ground temperatures usually being a bit warmer than weather app temperatures, especially at lower temperatures.

I can also compare the counting accuracy between citizens and a scientist, both at the end of the experiment while collecting the ants (120 minutes) and after freezing with the number of ants counted in the lab during species identification.

# add a citizen vs scientist column
tmp_count_accuracy <- exp_totals %>%
  dplyr::select(1:3) %>%
  left_join(sp_totals, by = "sample_ID") %>%  # add lab totals from species data
  filter(total_ants_sp > 0) %>%   # only keep rows with ants
  mutate(experimenter = case_when(    # create new experimenter column
    grepl("^22-|^23-", sample_ID) ~ "citizens",  # if sample ID starts with 22 or 23, assign citizen
    grepl("^24-", sample_ID) ~ "scientist",  # if sample ID starts with 24, assign scientist
    .default = NA_character_  # Assigns NA if none of the conditions match
  ))

# plot count accuracy after 120 minutes, citizens vs scientist
(tmp_count_acc_120_plot <- ggplot(na.omit(tmp_count_accuracy), aes(x = total_120, y = total_ants_sp, color = as.factor(experimenter)))+
   geom_point()+
   geom_smooth(method = lm, se = F)+
   geom_abline(linetype = 2, color = "red")+ # add 1:1 identity line in red and dashed
   scale_color_paletteer_d("nationalparkcolors::Acadia") +  # Apply Acadia discrete color scale
   scale_x_continuous(limits = c(0, max(tmp_count_accuracy$total_ants_sp, na.rm = T))) +
   scale_y_continuous(limits = c(0, max(tmp_count_accuracy$total_ants_sp, na.rm = T))) +
   labs(x = "Number of ants counted after 120 minutes", 
        y = "Number of ants counted in the lab", color = "Experimenter") +
   theme_classic2() +
   theme(legend.text = element_text(size = 8),  # Smaller legend text
        legend.title = element_text(size = 12, hjust = 0.5, vjust = 2),  # Smaller legend title
        legend.key.size = unit(0.6, "cm"),  # Makes the legend squares smaller
        axis.title.x = element_text(margin = margin(t = 10)),  # adjust distance from axis title to axis
        axis.title.y = element_text(margin = margin(r = 10), hjust = 0.5)) # adjust distance from axis title to axis, center title
 )
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

# plot count accuracy after freezing, citizens vs scientist
(tmp_count_acc_frozen_plot <- ggplot(na.omit(tmp_count_accuracy), aes(x = total_Frozen, y = total_ants_sp, color = as.factor(experimenter)))+
   geom_point()+
   geom_smooth(method = lm, se = F)+
   geom_abline(linetype = 2, color = "red")+ # add 1:1 identity line in red and dashed
   scale_color_paletteer_d("nationalparkcolors::Acadia") +  # Apply Acadia discrete color scale
   scale_x_continuous(limits = c(0, max(tmp_count_accuracy$total_ants_sp, na.rm = T))) +
   scale_y_continuous(limits = c(0, max(tmp_count_accuracy$total_ants_sp, na.rm = T))) +
   labs(x = "Number of ants counted after freezing", 
        y = "Number of ants counted in the lab", color = "Experimenter") +
   theme_classic2() +
   theme(legend.text = element_text(size = 8),  # Smaller legend text
        legend.title = element_text(size = 12, hjust = 0.5, vjust = 2),  # Smaller legend title
        legend.key.size = unit(0.6, "cm"),  # Makes the legend squares smaller
        axis.title.x = element_text(margin = margin(t = 10)),  # adjust distance from axis title to axis
        axis.title.y = element_text(margin = margin(r = 10), hjust = 0.5)) # adjust distance from axis title to axis, center title
 )
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_smooth()`).

Overall, the graph after 120 minutes shows that while ant counts are relatively accurate when few ants are present, counts during the experiment with live ants running around become somewhat wild conjectures when ants are numerous, e.g. above 100 ants. Surprisingly, it appears that citizens were more accurate when counting the ants after 120 minutes than the scientist (me). However, this might be due to the fact that one sample (23-22069) had 159 ants estimated at 120 minutes but only 6 ants counted in the lab.

In my case, part of the inaccuracy comes from the fact that often, many ants were foraging on the baits from underneath the A6 bait card, through the wet card, and were thus invisible when counting before picking up the card. The way I proceeded was to count the visible ants, pick up the card and place it in the resealable bag, and write down whether ants were present under the bait card. When only a few ants were under it and I could clearly count them, I added them to the total, but when too many ants to be able to count were under the card, I only wrote it as a comment instead of making a wild guess as to how many ants were under the card. For example, the strong outlier with > 300 ants in the card is experiment 24-11108, where I counted 76 ants visible at 120 minutes and wrote “lots [of ants] under card” as a comment.

Since I cannot know how the different participants dealt with counts and whether they encountered this issue of ants under the cards at all, it is difficult to know exactly what the difference between me and the citizens is due to.

rm(list = ls(pattern = "^tmp_"))
0.5.2 Visualization of ant species and experiment distributions

As I have also identified the species of ants found in the different experiments, I can visualize the diversity of ant species found, whether overall, per year or per species. I plot both the occurence (number of experiments where a species occured) and the number of individuals of each species found.

I also plot the number of experiments done per year and per city.

# create table with relevant columns
tmp_sp_distr <- env_analysis %>%
  dplyr::select(c(sample_ID, sp_abbr, individuals, date, year, city)) %>%  # select relevant columns
  filter(individuals > 0 & !is.na(individuals) & !is.na(city))  # only keep rows with ants

# distribution of experiments per year
(tmp_exp_distr_year_plot <- ggplot(na.omit(cut_env_data), aes(x = as.factor(year), fill = as.factor(year))) +
  geom_bar(position = "dodge") +  
  scale_fill_paletteer_d("nationalparkcolors::Acadia") +  # Apply Acadia discrete color scale
  labs(y = "Number of experiments") +
   theme_bw() +
   theme(legend.position = "none",  # set legend text
        axis.title.x = element_blank(),  # adjust distance from axis title to axis, set text size
        axis.title.y = element_text(margin = margin(r = 10), size = 12)) # adjust distance from axis title to axis, center title
)

# distribution of experiments per city
(tmp_exp_distr_city_plot <- ggplot(na.omit(cut_env_data), aes(x = as.factor(city), fill = as.factor(city))) +
  geom_bar(position = "dodge") +  
  scale_fill_paletteer_d("nationalparkcolors::Acadia") +  # Apply Acadia discrete color scale
  labs(x = "City", 
        y = "Number of experiments") +
   theme_bw() +
   theme(legend.position = "none",  # set legend text
        axis.title.x = element_blank(),  # adjust distance from axis title to axis, set text size
        axis.title.y = element_text(margin = margin(r = 10), size = 12)) # adjust distance from axis title to axis, center title
)

# distribution of experiments per year and per city
(tmp_exp_distr_year_city_plot <- ggplot(na.omit(cut_env_data), aes(x = as.factor(year), fill = as.factor(city))) +
  geom_bar(position = "dodge") +  
  scale_fill_paletteer_d("nationalparkcolors::Acadia") +  # Apply Acadia discrete color scale
  labs(x = "Year", y = "Number of experiments", fill = "City") +
   theme_bw() +
   theme(legend.text = element_text(size = 10),  # set legend text
        legend.title = element_text(size = 12, hjust = 0.5, vjust = 2),  # Smaller legend title
        legend.key.size = unit(0.5, "cm"),  # Makes the legend squares smaller
        axis.title.x = element_blank(),  # adjust distance from axis title to axis, set text size
        axis.title.y = element_text(margin = margin(r = 10), size = 12)) # adjust distance from axis title to axis, center title
)

# distribution of species per year
(tmp_sp_distr_year_plot <- ggplot(tmp_sp_distr, aes(x = as.factor(year), fill = as.factor(sp_abbr))) +
  geom_bar(position = "dodge") +  
  scale_fill_viridis_d() +  # Apply Acadia discrete color scale
  labs(x = "Number of species occuring each year", 
        y = "Number of experiments", fill = "Species") +
   theme_bw() +
   theme(legend.text = element_text(size = 10, face = "italic"),  # set legend text
        legend.title = element_text(size = 12, hjust = 0.5, vjust = 2),  # Smaller legend title
        legend.key.size = unit(0.5, "cm"),  # Makes the legend squares smaller
        axis.title.x = element_text(margin = margin(t = 10), size = 12),  # adjust distance from axis title to axis, set text size
        axis.title.y = element_text(margin = margin(r = 10), size = 12))+ # adjust distance from axis title to axis, center title
    guides(fill = guide_legend(ncol = 1))  # Set legend to 1 column
)

# distribution of species per city
(tmp_sp_distr_city_plot <- ggplot(tmp_sp_distr, aes(x = as.factor(city), fill = as.factor(sp_abbr))) +
  geom_bar(position = "dodge") +  
  scale_fill_viridis_d() +  # Apply Acadia discrete color scale
  labs(x = "Number of species occuring in each city", 
        y = "Number of experiments", fill = "Species") +
   theme_bw() +
   theme(legend.text = element_text(size = 10, face = "italic"),  # set legend text
        legend.title = element_text(size = 12, hjust = 0.5, vjust = 2),  # Smaller legend title
        legend.key.size = unit(0.5, "cm"),  # Makes the legend squares smaller
        axis.title.x = element_text(margin = margin(t = 10), size = 12),  # adjust distance from axis title to axis, set text size
        axis.title.y = element_text(margin = margin(r = 10), size = 12))+ # adjust distance from axis title to axis, center title
    guides(fill = guide_legend(ncol = 1))  # Set legend to 1 column
)

# overall species occurrence
(tmp_species_occ_plot <- ggplot(tmp_sp_distr, aes(x=sp_abbr, fill = sp_abbr))+  
    geom_bar()+ 
    scale_fill_viridis_d()+
    labs(y = "Number of experiments") +
    theme_classic()+
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1, size = 12, face = "italic"),
          legend.position = "none",
          axis.title.x = element_blank(),  # adjust distance from axis title to axis
          axis.title.y = element_text(size = 12, margin = margin(r = 15), hjust = 0.5)) # adjust distance from axis title to axis, center title
)

# number of ants per species
(tmp_species_ind_plot <- ggplot(tmp_sp_distr, aes(x=sp_abbr, y=individuals, fill = sp_abbr))+  
    geom_col()+ 
    scale_fill_viridis_d()+
    labs(y = "Number of ant individuals") +
    theme_classic()+
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1, size = 12, face = "italic"),
          legend.position = "none",
          axis.title.x = element_blank(),  # adjust distance from axis title to axis
          axis.title.y = element_text(size = 12, margin = margin(r = 15), hjust = 0.5)) # adjust distance from axis title to axis, center title
)

(tmp_species_grouped_plot <- tmp_species_ind_plot + tmp_species_ind_plot +
    plot_layout(nrow = 2) + plot_annotation(tag_levels = 'a', tag_suffix = ")"))

Since I visited the 9 sites of my 2024 fieldwork 3 times over the span of 3 months, we can also plot the species diversity by month and by site for the 2024 experiments.

# create table with relevant columns and rows
tmp_sp_distr_24 <- env_analysis %>%
  dplyr::select(c(sample_ID, sp_abbr, individuals, date_day_month, year, site)) %>%  # select relevant columns
  filter(individuals > 0 & !is.na(individuals) & year == 2024 & month(date_day_month) != 4) %>%  # only keep rows with ants and 2024 data
  mutate(month = case_when(    # create new month column
    (month(date_day_month) == 5 | as.character(date_day_month) == "1999-06-04") ~ "May",  
    (as.character(date_day_month) != "1999-06-04" & month(date_day_month) == 6) ~ "June",
    month(date_day_month) > 6 ~ "July",
    .default = NULL)) %>%
  mutate(month = factor(month, levels = c("May", "June", "July")))  # Convert month to factor with ordered levels

# Create the plot per month
(tmp_map_sp24_month <- ggplot(tmp_sp_distr_24, aes(x = month, fill = sp_abbr)) +
  geom_bar(position = "dodge") +  
  facet_wrap(~ site) +  
  scale_fill_viridis_d(option = "D") +  # Apply Viridis discrete color scale
  labs(x = "Species occurrence per month", y = "Number of experiments per site", fill = "Species") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10),  # Rotate x-axis labels
        legend.text = element_text(size = 10, face = "italic"),  # Smaller legend text
        legend.title = element_text(size = 12, hjust = 0.5, vjust = 2),  # Smaller legend title
        legend.key.size = unit(0.6, "cm"),  # Makes the legend squares smaller
        strip.text = element_text(size = 10), # Smaller facet labels (location names)
        axis.title.x = element_text(margin = margin(t = 10), size = 12),  # adjust distance from axis title to axis
        axis.title.y = element_text(margin = margin(r = 10), size = 12))+ # adjust distance from axis title to axis, center title       
  guides(fill = guide_legend(ncol = 1))  # Set legend to 1 column
)

rm(list = ls(pattern = "^tmp_"))

1. Research Question 1

The overarching research question is: is ant foraging activity influenced by environmental conditions and land use ?

More specifically, does the average number of ants collected at the end of the baiting experiments depend on ground temperature, soil moisture, reported weather conditions, time of day, date of experiment, mean impervious surface coverage in a 100-meter radius around the experiment sites, and the interaction of these factors ?

  • H0: the average number of ants found at the end of the experiments is influenced by environmental conditions or urbanization
  • H1: the average number of ants found at the end of the experiments is not influenced by environmental conditions or urbanization

Description of tested variables

  • Response variable:
    • relative number of ants collected at the end of each ant picnic experiment per species (number of ants of species i collected at the end of experiment j offset by the maximum number of ants of species found on any experiment), quantitative
  • Environmental variables measured:
    • Mean ground temperature recorded by climate loggers over the experiment time, continuous numerical variable
    • Mean underground temperature recorded by climate loggers over the experiment time, continuous numerical variable
    • Mean above ground temperature recorded by climate loggers over the experiment time, continuous numerical variable
    • Temperature recorded by participants using a weather app, discrete numerical variable
    • Mean calibrated soil moisture recorded by climate loggers over the experiment time, discrete numerical variable
    • Presence or absence of rain, recorded by participants, binary variable
    • Reported cloud conditions recorded by participants, ordinal categorical variable
    • Shade as recorded by participants, ordinal categorical variable
    • Wind conditions as recorded by participants, ordinal categorical variable
    • Mean impervious surface coverage in a 100m radius around each experiment site, continuous numerical variable
    • Time of day of the start of the experiment, with the start time recorded as minutes since 8 am, discrete numerical variable
    • Date (day and month) of the experiment, recorded as a relative date (earliest experiment in the year = 1), discrete numerical variable
  • Random effects:
    • Site, nominal categorical variable
    • Species, nominal categorical variable

1.0 Correlation analysis of environmental variables

I expect several of my explanatory variables, such as the time of day, weather, date, temperature to be correlated. Additionally, I have several measures for temperature. This correlation analysis allows to get a clearer idea of how the variables correlate and of which variables are the most relevant for my analysis.

I perform two correlation analyses, the first focusing only on the quantitative variables (excluding rain, clouds, shade, wind, city, and year) and the second including all variables. For the first analysis, I want to use the Pearson correlation coefficient, which assumes a linear relationship and that variables are normally distributed with no extreme outliers.

I start by testing the normality of the distribution of my variables as well as checking visually for outliers. From the Q-Q plots, it seems that the variables from the microclimate loggers and the temperature recorded by participants are relatively close to a normal distribution, although with some deviation at extreme values. the impervious surface, experiment start time, and date of experiment show clear deviation from normality. The results of the Shapiro-Wilk test confirm that all variables significantly differ from a normal distribution.

Since my continuous variables are not normally distributed, both correlation analyses (only quantitative variables, both quantitative and qualitative variables) are run using Spearman’s rank correlation coefficient.

#create a data frame with only the numerical variables
tmp_cor_plot_env_num <- cut_env_data[,c("t1_mean", "t2_mean", "t3_mean", "temperature", "imp_mean", 
                                   "vol_moist_mean", "minutes_since_8am", 
                                   "relative_day")]

# remove NAs from data set
tmp_cor_plot_env_num <- na.omit(tmp_cor_plot_env_num)

#rename variables for plotting
names(tmp_cor_plot_env_num)[names(tmp_cor_plot_env_num) == "t1_mean"] <- "Temperature below ground"
names(tmp_cor_plot_env_num)[names(tmp_cor_plot_env_num) == "t2_mean"] <- "Temperature at ground surface"
names(tmp_cor_plot_env_num)[names(tmp_cor_plot_env_num) == "t3_mean"] <- "Temperature above ground"
names(tmp_cor_plot_env_num)[names(tmp_cor_plot_env_num) == "temperature"] <- "Temperature (participants)"
names(tmp_cor_plot_env_num)[names(tmp_cor_plot_env_num) == "imp_mean"] <- "Impervious surface coverage"
names(tmp_cor_plot_env_num)[names(tmp_cor_plot_env_num) == "vol_moist_mean"] <- "Soil moisture"
names(tmp_cor_plot_env_num)[names(tmp_cor_plot_env_num) == "minutes_since_8am"] <- "Experiment start time"
names(tmp_cor_plot_env_num)[names(tmp_cor_plot_env_num) == "relative_day"] <- "Date of experiment"

# Checking for normality of variable distributions
# create empty list
tmp_qq_plots <- list()
# plot Q-Q plots for each variable
for (i in 1:ncol(tmp_cor_plot_env_num)) {
  tmp_qq_plots[[i]] <- ggplot(tmp_cor_plot_env_num, aes(sample = .data[[colnames(tmp_cor_plot_env_num)[i]]])) +
  stat_qq()+
  stat_qq_line()+
  ggtitle(str_wrap(colnames(tmp_cor_plot_env_num)[i], width = 20))+
  theme(plot.title = element_text(size = 10))  
}

# Arrange the plots in a 2-row, 4-column grid
(tmp_env_qqplot <- wrap_plots(tmp_qq_plots) + plot_layout(nrow = 2, ncol = 4))

# saving the plot
#ggsave(
#    filename = "../images/data_exploration/env_var_qqplots.png",
#    plot = tmp_env_qqplot,
#    dpi = 300,
#    height = 12,
#    width = 20,
#    units = "cm"
#  )

# Shapiro-wilk test for normality
# Create an empty data frame to store results
tmp_shapiro_results <- data.frame(Variable = character(), P_Value = numeric(), stringsAsFactors = FALSE)

# Loop through each column in cor_plot_env_num
for (tmp_i in 1:ncol(tmp_cor_plot_env_num)) {
  tmp_col_name <- colnames(tmp_cor_plot_env_num)[tmp_i]  # Get the column name
  tmp_test_result <- shapiro.test(tmp_cor_plot_env_num[[tmp_i]])  # Perform Shapiro-Wilk test
  
  # Store results in the data frame
  tmp_shapiro_results <- rbind(tmp_shapiro_results, data.frame(Variable = tmp_col_name, P_Value = tmp_test_result$p.value))
}

# Print the variables not significantly 
print(tmp_shapiro_results)
##                        Variable      P_Value
## 1      Temperature below ground 1.032684e-02
## 2 Temperature at ground surface 2.280288e-03
## 3      Temperature above ground 1.607317e-03
## 4    Temperature (participants) 9.123991e-03
## 5   Impervious surface coverage 2.093473e-09
## 6                 Soil moisture 3.531498e-03
## 7         Experiment start time 2.765104e-10
## 8            Date of experiment 3.630909e-11
# compute the correlation matrix with p-values
tmp_env_cor_results <- rcorr(as.matrix(tmp_cor_plot_env_num), type = "spearman") 

# extract correlation matrix and print
tmp_env_cor_matrix <- tmp_env_cor_results$r; tmp_env_cor_matrix
##                               Temperature below ground
## Temperature below ground                    1.00000000
## Temperature at ground surface               0.83419975
## Temperature above ground                    0.77569343
## Temperature (participants)                  0.56721655
## Impervious surface coverage                -0.07436956
## Soil moisture                               0.11123787
## Experiment start time                       0.42773839
## Date of experiment                          0.48361254
##                               Temperature at ground surface
## Temperature below ground                          0.8341997
## Temperature at ground surface                     1.0000000
## Temperature above ground                          0.9696263
## Temperature (participants)                        0.6750450
## Impervious surface coverage                      -0.1012513
## Soil moisture                                     0.2087843
## Experiment start time                             0.3006518
## Date of experiment                                0.5032053
##                               Temperature above ground
## Temperature below ground                    0.77569343
## Temperature at ground surface               0.96962631
## Temperature above ground                    1.00000000
## Temperature (participants)                  0.76636942
## Impervious surface coverage                -0.08311249
## Soil moisture                               0.28476850
## Experiment start time                       0.33728020
## Date of experiment                          0.57436184
##                               Temperature (participants)
## Temperature below ground                      0.56721655
## Temperature at ground surface                 0.67504503
## Temperature above ground                      0.76636942
## Temperature (participants)                    1.00000000
## Impervious surface coverage                   0.03619624
## Soil moisture                                 0.24254308
## Experiment start time                         0.48148038
## Date of experiment                            0.48527967
##                               Impervious surface coverage Soil moisture
## Temperature below ground                      -0.07436956     0.1112379
## Temperature at ground surface                 -0.10125127     0.2087843
## Temperature above ground                      -0.08311249     0.2847685
## Temperature (participants)                     0.03619624     0.2425431
## Impervious surface coverage                    1.00000000    -0.1498928
## Soil moisture                                 -0.14989285     1.0000000
## Experiment start time                         -0.06093111     0.2555638
## Date of experiment                            -0.10983860     0.3984425
##                               Experiment start time Date of experiment
## Temperature below ground                 0.42773839          0.4836125
## Temperature at ground surface            0.30065183          0.5032053
## Temperature above ground                 0.33728020          0.5743618
## Temperature (participants)               0.48148038          0.4852797
## Impervious surface coverage             -0.06093111         -0.1098386
## Soil moisture                            0.25556384          0.3984425
## Experiment start time                    1.00000000          0.2355542
## Date of experiment                       0.23555420          1.0000000
# extract the p-values
tmp_env_p_matrix <- tmp_env_cor_results$P

# plot correlation matrix (only insignificant ones hidden)
tmp_corr_num <- corrplot(tmp_env_cor_matrix, 
         type = "upper",
         method = "color",
         diag = F,                         # remove diagonal
         #addCoef.col = "black",           # add coefficient coeffs, not used 
         p.mat = tmp_env_p_matrix,         # add p-values matrix
         sig.level = c(0.001, 0.01, 0.05), # significance thresholds
         tl.col = "black",                 # color of variable labels
         cl.align.text = "l",              # alignment of color legend
         cl.offset = 0.3,                  # offset color legend text to the right
         number.cex = 0.6,                 # size of coefficient text
         tl.cex = 0.7,                     # size of variable labels
         insig = "label_sig",              # add stars according to significance thresholds
         pch.cex = 1)                      # size of stars

# save the plot as a png, commented out for knitting of the rmarkdown file
# dev.copy(png, "../images/data_exploration/correlation_plot_num.png", width = 1600, height = 1600, res = 300)
# dev.off()
#create a data frame with only the numerical variables
tmp_cor_plot_env_all <- cut_env_data[,c("t1_mean", "t2_mean", "t3_mean", "temperature", 
                                        "imp_mean", "vol_moist_mean", "minutes_since_8am", 
                                        "relative_day", "clouds_numeric", "shade_numeric", 
                                        "wind_numeric", "rain_numeric", "year", "city")]

tmp_cor_plot_env_all <- tmp_cor_plot_env_all %>%
  # Code the city variable as numbers
  mutate(city = dplyr::recode(city, "Halle" = 1, "Leipzig" = 2, "Berlin" = 3)) %>%
  # Remove NAs
  na.omit() %>%
  # Rename variables for plotting
  rename(
    "Temperature below ground" = t1_mean,
    "Temperature at ground surface" = t2_mean,
    "Temperature above ground" = t3_mean,
    "Temperature (participants)" = temperature,
    "Impervious surface coverage" = imp_mean,
    "Soil moisture" = vol_moist_mean,
    "Experiment start time" = minutes_since_8am,
    "Date of experiment" = relative_day,
    "Year" = year,
    "Cloud cover (participants)" = clouds_numeric,
    "Shade (participants)" = shade_numeric,
    "Wind (participants)" = wind_numeric,
    "Rain (participants)" = rain_numeric,
    "City" = city)

# compute the correlation matrix with p-values
tmp_env_cor_all_results <- rcorr(as.matrix(tmp_cor_plot_env_all), type = "spearman") 

# extract correlation matrix and print results
tmp_env_cor_all_matrix <- tmp_env_cor_all_results$r; tmp_env_cor_all_matrix
##                               Temperature below ground
## Temperature below ground                    1.00000000
## Temperature at ground surface               0.83328957
## Temperature above ground                    0.77561643
## Temperature (participants)                  0.56644092
## Impervious surface coverage                -0.06592400
## Soil moisture                               0.11655318
## Experiment start time                       0.42729222
## Date of experiment                          0.48418019
## Cloud cover (participants)                 -0.08455477
## Shade (participants)                       -0.45324889
## Wind (participants)                         0.05391416
## Rain (participants)                         0.04470452
## Year                                        0.36997778
## City                                       -0.22694091
##                               Temperature at ground surface
## Temperature below ground                         0.83328957
## Temperature at ground surface                    1.00000000
## Temperature above ground                         0.97001775
## Temperature (participants)                       0.67232219
## Impervious surface coverage                     -0.08762115
## Soil moisture                                    0.21644768
## Experiment start time                            0.29861318
## Date of experiment                               0.50060143
## Cloud cover (participants)                      -0.31001774
## Shade (participants)                            -0.41983030
## Wind (participants)                             -0.06590040
## Rain (participants)                             -0.07483132
## Year                                             0.37967491
## City                                            -0.15631718
##                               Temperature above ground
## Temperature below ground                    0.77561643
## Temperature at ground surface               0.97001775
## Temperature above ground                    1.00000000
## Temperature (participants)                  0.76276989
## Impervious surface coverage                -0.06606175
## Soil moisture                               0.29242172
## Experiment start time                       0.33550292
## Date of experiment                          0.57260749
## Cloud cover (participants)                 -0.31793377
## Shade (participants)                       -0.32468873
## Wind (participants)                        -0.09947974
## Rain (participants)                        -0.09013773
## Year                                        0.39127259
## City                                       -0.12935776
##                               Temperature (participants)
## Temperature below ground                      0.56644092
## Temperature at ground surface                 0.67232219
## Temperature above ground                      0.76276989
## Temperature (participants)                    1.00000000
## Impervious surface coverage                   0.05441409
## Soil moisture                                 0.25264427
## Experiment start time                         0.48117322
## Date of experiment                            0.47807917
## Cloud cover (participants)                   -0.20012325
## Shade (participants)                          0.03623059
## Wind (participants)                          -0.13054458
## Rain (participants)                          -0.09839737
## Year                                          0.19799213
## City                                         -0.09189725
##                               Impervious surface coverage Soil moisture
## Temperature below ground                      -0.06592400    0.11655318
## Temperature at ground surface                 -0.08762115    0.21644768
## Temperature above ground                      -0.06606175    0.29242172
## Temperature (participants)                     0.05441409    0.25264427
## Impervious surface coverage                    1.00000000   -0.14347183
## Soil moisture                                 -0.14347183    1.00000000
## Experiment start time                         -0.05024439    0.25020451
## Date of experiment                            -0.10377918    0.39844625
## Cloud cover (participants)                     0.06855137    0.03551604
## Shade (participants)                           0.12346464   -0.02065546
## Wind (participants)                           -0.03088054   -0.07286811
## Rain (participants)                            0.04835307   -0.13702879
## Year                                          -0.24167275    0.49475040
## City                                           0.23016342   -0.18950715
##                               Experiment start time Date of experiment
## Temperature below ground                 0.42729222          0.4841802
## Temperature at ground surface            0.29861318          0.5006014
## Temperature above ground                 0.33550292          0.5726075
## Temperature (participants)               0.48117322          0.4780792
## Impervious surface coverage             -0.05024439         -0.1037792
## Soil moisture                            0.25020451          0.3984462
## Experiment start time                    1.00000000          0.2283132
## Date of experiment                       0.22831317          1.0000000
## Cloud cover (participants)               0.21900567         -0.1270443
## Shade (participants)                    -0.06504575         -0.1368369
## Wind (participants)                      0.11053493         -0.1000801
## Rain (participants)                     -0.08333678          0.1544130
## Year                                     0.30444049          0.6727533
## City                                    -0.17880359         -0.2211489
##                               Cloud cover (participants) Shade (participants)
## Temperature below ground                    -0.084554774         -0.453248893
## Temperature at ground surface               -0.310017741         -0.419830298
## Temperature above ground                    -0.317933767         -0.324688725
## Temperature (participants)                  -0.200123247          0.036230586
## Impervious surface coverage                  0.068551366          0.123464644
## Soil moisture                                0.035516039         -0.020655462
## Experiment start time                        0.219005670         -0.065045753
## Date of experiment                          -0.127044301         -0.136836908
## Cloud cover (participants)                   1.000000000          0.009229617
## Shade (participants)                         0.009229617          1.000000000
## Wind (participants)                          0.184981936         -0.102597542
## Rain (participants)                          0.274239271         -0.037687663
## Year                                        -0.018226670         -0.241978620
## City                                        -0.152549162          0.106695073
##                               Wind (participants) Rain (participants)
## Temperature below ground               0.05391416          0.04470452
## Temperature at ground surface         -0.06590040         -0.07483132
## Temperature above ground              -0.09947974         -0.09013773
## Temperature (participants)            -0.13054458         -0.09839737
## Impervious surface coverage           -0.03088054          0.04835307
## Soil moisture                         -0.07286811         -0.13702879
## Experiment start time                  0.11053493         -0.08333678
## Date of experiment                    -0.10008007          0.15441302
## Cloud cover (participants)             0.18498194          0.27423927
## Shade (participants)                  -0.10259754         -0.03768766
## Wind (participants)                    1.00000000          0.03047381
## Rain (participants)                    0.03047381          1.00000000
## Year                                   0.05421455          0.05657501
## City                                  -0.15087292         -0.19823003
##                                      Year        City
## Temperature below ground       0.36997778 -0.22694091
## Temperature at ground surface  0.37967491 -0.15631718
## Temperature above ground       0.39127259 -0.12935776
## Temperature (participants)     0.19799213 -0.09189725
## Impervious surface coverage   -0.24167275  0.23016342
## Soil moisture                  0.49475040 -0.18950715
## Experiment start time          0.30444049 -0.17880359
## Date of experiment             0.67275329 -0.22114893
## Cloud cover (participants)    -0.01822667 -0.15254916
## Shade (participants)          -0.24197862  0.10669507
## Wind (participants)            0.05421455 -0.15087292
## Rain (participants)            0.05657501 -0.19823003
## Year                           1.00000000 -0.42821268
## City                          -0.42821268  1.00000000
# extract the p-values
tmp_env_p_all_matrix <- tmp_env_cor_all_results$P

# plot correlation matrix (only insignificant ones hidden)
corrplot(tmp_env_cor_all_matrix, 
         type = "upper",
         method = "color",
         diag = F,                         # remove diagonal
         #addCoef.col = "black",           # add coefficient coeffs, not used 
         p.mat = tmp_env_p_all_matrix,     # add p-values matrix
         sig.level = c(0.001, 0.01, 0.05), # significance thresholds
         tl.col = "black",                 # color of variable labels
         cl.align.text = "l",              # alignment of color legend
         cl.offset = 0.3,                  # offset color legend text to the right
         number.cex = 0.6,                 # size of coefficient text
         tl.cex = 0.7,                     # size of variable labels
         insig = "label_sig",              # add start according to significance thresholds
         pch.cex = 1)                      # size of stars

# save the plot as a png, commented out for knitting of the rmarkdown file
# dev.copy(png, "../images/data_exploration/correlation_plot_all.png", width = 1600, height = 1600, res = 300)
# dev.off()

All temperature measures are highly correlated (r > 0.5), which is a good control and suggests that there are no significant discrepancies in the temperature measures since they align well. For analysis, the ground temperature will be used as the sole temperature variable as it is the measure that should best reflect the temperature experienced by foraging ants.

#create a data frame with only the numerical variables
tmp_cor_plot_env_t2 <- cut_env_data[,c("t2_mean", "imp_mean", "vol_moist_mean", "minutes_since_8am", 
                                        "relative_day", "clouds_numeric", "shade_numeric", 
                                        "wind_numeric", "rain_numeric",
                                        "year", "city")]

tmp_cor_plot_env_t2 <- tmp_cor_plot_env_t2 %>%
  # Code the city variable as numbers
  mutate(city = dplyr::recode(city, "Halle" = 1, "Leipzig" = 2, "Berlin" = 3)) %>%
  # Remove NAs
  na.omit() %>%
  # Rename variables for plotting
  rename(
    "Temperature at ground surface" = t2_mean,
    "Impervious surface coverage" = imp_mean,
    "Soil moisture" = vol_moist_mean,
    "Experiment start time" = minutes_since_8am,
    "Date of experiment" = relative_day,
    "Year" = year,
    "Cloud cover (participants)" = clouds_numeric,
    "Shade (participants)" = shade_numeric,
    "Wind (participants)" = wind_numeric,
    "Rain (participants)" = rain_numeric,
    "City" = city)

# compute the correlation matrix with p-values
tmp_env_cor_t2_results <- rcorr(as.matrix(tmp_cor_plot_env_t2), type = "spearman") 

# extract correlation matrix and print
tmp_env_cor_t2_matrix <- tmp_env_cor_t2_results$r; tmp_env_cor_t2_matrix
##                               Temperature at ground surface
## Temperature at ground surface                    1.00000000
## Impervious surface coverage                     -0.07394311
## Soil moisture                                    0.22230648
## Experiment start time                            0.31547805
## Date of experiment                               0.48487924
## Cloud cover (participants)                      -0.29627708
## Shade (participants)                            -0.42511345
## Wind (participants)                             -0.05030683
## Rain (participants)                             -0.07351319
## Year                                             0.37590084
## City                                            -0.16435596
##                               Impervious surface coverage Soil moisture
## Temperature at ground surface                 -0.07394311    0.22230648
## Impervious surface coverage                    1.00000000   -0.15129769
## Soil moisture                                 -0.15129769    1.00000000
## Experiment start time                         -0.04392663    0.25843734
## Date of experiment                            -0.11530860    0.41139718
## Cloud cover (participants)                     0.06500863    0.04021584
## Shade (participants)                           0.12149443   -0.03390096
## Wind (participants)                           -0.02918913   -0.06670234
## Rain (participants)                            0.04500706   -0.13194680
## Year                                          -0.24325424    0.50272161
## City                                           0.23700457   -0.20879219
##                               Experiment start time Date of experiment
## Temperature at ground surface            0.31547805         0.48487924
## Impervious surface coverage             -0.04392663        -0.11530860
## Soil moisture                            0.25843734         0.41139718
## Experiment start time                    1.00000000         0.22775770
## Date of experiment                       0.22775770         1.00000000
## Cloud cover (participants)               0.22448854        -0.12096476
## Shade (participants)                    -0.08029459        -0.14166884
## Wind (participants)                      0.11922796        -0.09291709
## Rain (participants)                     -0.08081920         0.15634744
## Year                                     0.30949044         0.67584872
## City                                    -0.19153845        -0.23546431
##                               Cloud cover (participants) Shade (participants)
## Temperature at ground surface               -0.296277077         -0.425113446
## Impervious surface coverage                  0.065008632          0.121494435
## Soil moisture                                0.040215842         -0.033900960
## Experiment start time                        0.224488536         -0.080294586
## Date of experiment                          -0.120964762         -0.141668836
## Cloud cover (participants)                   1.000000000          0.007338021
## Shade (participants)                         0.007338021          1.000000000
## Wind (participants)                          0.179994439         -0.112799893
## Rain (participants)                          0.273546537         -0.038069826
## Year                                        -0.016298624         -0.254472564
## City                                        -0.157434988          0.122652798
##                               Wind (participants) Rain (participants)
## Temperature at ground surface         -0.05030683         -0.07351319
## Impervious surface coverage           -0.02918913          0.04500706
## Soil moisture                         -0.06670234         -0.13194680
## Experiment start time                  0.11922796         -0.08081920
## Date of experiment                    -0.09291709          0.15634744
## Cloud cover (participants)             0.17999444          0.27354654
## Shade (participants)                  -0.11279989         -0.03806983
## Wind (participants)                    1.00000000          0.03043765
## Rain (participants)                    0.03043765          1.00000000
## Year                                   0.06086049          0.05864652
## City                                  -0.15213195         -0.19803643
##                                      Year       City
## Temperature at ground surface  0.37590084 -0.1643560
## Impervious surface coverage   -0.24325424  0.2370046
## Soil moisture                  0.50272161 -0.2087922
## Experiment start time          0.30949044 -0.1915384
## Date of experiment             0.67584872 -0.2354643
## Cloud cover (participants)    -0.01629862 -0.1574350
## Shade (participants)          -0.25447256  0.1226528
## Wind (participants)            0.06086049 -0.1521320
## Rain (participants)            0.05864652 -0.1980364
## Year                           1.00000000 -0.4438117
## City                          -0.44381173  1.0000000
# extract the p-values
tmp_env_p_t2_matrix <- tmp_env_cor_t2_results$P

# plot correlation matrix (only insignificant ones hidden)
corrplot(tmp_env_cor_t2_matrix, 
         type = "upper",
         method = "color",
         diag = F,                         # remove diagonal
         #addCoef.col = "black",           # add coefficient coeffs, not used
         p.mat = tmp_env_p_t2_matrix,      # add p-values matrix
         sig.level = c(0.001, 0.01, 0.05), # significance thresholds
         tl.col = "black",                 # color of variable labels
         cl.align.text = "l",              # alignment of color legend
         cl.offset = 0.3,                  # offset color legend text to the right
         number.cex = 0.6,                 # size of coefficient text
         tl.cex = 0.8,                     # size of variable labels
         cl.cex = 0.7,                     # size of color legend text
         insig = "label_sig",              # add start according to significance thresholds
         pch.cex = 1)                      # size of stars

# save the plot as a png, commented out for knitting of rmarkdown file
# dev.copy(png, "../images/data_exploration/correlation_plot_T2.png", width = 1600, height = 1600, res = 300)
# dev.off()

The results of the correlation analysis show:

  • Year and date of experiment are strongly correlated (r = 0.68), and year and city are moderately correlated (r = -0.44). This is expected as the 2024 experiments were only conducted in Leipzig and over a broader range of dates, while the experiments in 2022 and 2023 were conducted in all 3 cities and usually earlier in the year. Year is also moderately correlated to experiment start time (r = 0.31), soil moisture (r = 0.50), and ground temperature (r = 0.38), which may be due to the fact that experiments in 2024 were more spread out over the time of day and the date than in the previous year. This means the year column may reflect trends in seasonal and daily temperature and moisture variations.

    • This can be accounted for by adding a “site” variable as a random effect in the models. “Site” represents the location or address where experiments were done and allows to group experiments done at the same school or at the same location for the 2024 fieldwork. Site should account for some of the variation both in year and city as different sites are from different cities and were used in different years.
  • Shade and temperature are moderately correlated (r = -0.43), which makes sense since shade lowers ground temperature. The shade variable could be an indirect indication of habitat structure around the baits (i.e., more or less trees around the experiment) or other microclimate characteristics, but it may also provide information which is mostly redundant with temperature. Models with and without shade as a fixed effect are compared below to see if keeping shade as a predictor improves the model or not.

  • Date of experiment is moderately correlated with temperature (r = 0.48) and soil moisture (r = 0.41), which likely reflects seasonal variations in temperature and weather.

  • Experiment start time is moderately correlated with temperature (r = 0.32), which reflects daily variations in temperature.

  • Cloud cover is moderately correlated with temperature (r = -0.30), which is to be expected as more cloud cover can have a negative impact on ground temperature.

  • Other variables are either weakly (r < 0.3) or non-significantly correlated.

    • Impervious surface cover is only weakly negatively correlated with soil moisture (r = -0.15) and with year (r = -0.24) and city (r = 0.24). The correlation with year might be due to the uneven sampling of the three cities over the 3 years. The positive (although weak) correlation with city suggests that impervious surface cover at a local level (100m around experiments) is higher in bigger cities.
    • Cloud cover and rain are weakly correlated (r = 0.27), which is relatively logical.
    • Rain also correlates weakly with experiment date (r = 0.16), which is likely due to the fact that several experiments were done on one day, so all experiments done in the same location on the same day will share rain presence or absence, while cloud cover or wind conditions may be more subjective.

A Variance Inflation Factor test can also be run on the variables (city and year removed since they will not be used as fixed effects) and shows that all variables have a VIF < 5, suggesting no significant multicollinearity.

tmp_vif_model <- lm(scaled_species_total ~ t2_mean + imp_mean + vol_moist_mean + 
                                       minutes_since_8am + relative_day +
                                       clouds_numeric + wind_numeric + 
                                       shade_numeric + rain_numeric, 
                data = env_analysis)

vif(tmp_vif_model)  # Run VIF test
##           t2_mean          imp_mean    vol_moist_mean minutes_since_8am 
##          1.961449          1.061534          1.264533          1.286789 
##      relative_day    clouds_numeric      wind_numeric     shade_numeric 
##          1.689057          1.445597          1.137733          1.293400 
##      rain_numeric 
##          1.286647
rm(list = ls(pattern = "^tmp_"))

1.1 Defining predictor variables for analysis

Based on the correlation analysis, I can define which variables are used as predictors (fixed effects) or random effects.

  • Response variable (reminder):
    • relative number of ants collected at the end of each ant picnic experiment per species (number of ants of species i collected at the end of experiment j offset by the maximum number of ants of species found on any experiment), quantitative
  • Predictor variables:
    • Temperature
      • Mean ground temperature recorded by climate loggers over the experiment time, continuous numerical variable
    • Soil moisture
      • Mean calibrated (volumetric) soil moisture recorded by climate loggers over the experiment time, continuous numerical variable
    • Impervious surface coverage
      • Mean impervious surface coverage in a 100m radius around each experiment site, continuous numerical variable
    • Experiment start time
      • Time of day of the start of the experiment, with the start time recorded as minutes since 8 am, discrete numerical variable
    • Date of experiment
      • Date (day and month) of the experiment, recorded as a relative date (earliest experiment in the year = 1), discrete numerical variable
    • Cloud cover
      • Reported cloud conditions recorded by participants, ordinal categorical variable, encoded from 1 to 3 (low to high)
    • Wind
      • Wind conditions as recorded by participants, ordinal categorical variable, encoded from 1 to 3 (low to high)
    • Shade
      • Shade as recorded by participants, ordinal categorical variable, encoded from 1 to 3 (low to high)
  • Random effects:
    • Site, nominal categorical variable
    • Species, nominal categorical variable

While rain would be an interesting predictor to measure, I am not going to include the absence or presence of rain as a predictor variable because its distribution is very unbalanced in my data: out of 239 experiments, only 7 experiments had rain while 229 did not.

table(cut_env_data$rain)
## 
## no rain    rain 
##     229       7

All other predictor variables than rain will be added into the initial model, but several variable combinations will be tested to find the best model fit.

1.2 Exploratory Data Analysis

This section details different visualizations of the data.

1.2.1 Visualizing response variable
# distribution of raw ant totals
ggplot(na.omit(env_analysis), aes(x = individuals)) +
  geom_histogram(bins = 15, fill = "skyblue", color = "black") +
  labs(x = "Raw number of ants per species and experiment", y = "Frequency")+
  theme(plot.title = element_text(hjust = 0.5))+
  theme_bw()

# distribution of scaled ant totals
ggplot(na.omit(env_analysis), aes(x = scaled_species_total))+
  geom_histogram(bins = 15, fill = "skyblue", color = "black") +
  labs(x = "Scaled number of ants per species and experiment", y = "Frequency")+
  theme(plot.title = element_text(hjust = 0.5))+
  theme_bw()

1.2.2 Visualizing explanatory variables

I can visualize the distributions of continuous explanatory variables as well as the relationship between the response and each variable.

# histogram of continuous predictors
na.omit(env_analysis) %>%
  gather(key = "variable", value = "value", t2_mean, vol_moist_mean, imp_mean) %>%
  ggplot(aes(x = value)) +
  geom_histogram(fill = "skyblue", color = "black", bins = 20) +
  facet_wrap(~ variable, scales = "free", labeller = labeller(variable = c(
    t2_mean = "Ground Temperature (\u00B0C)",
    vol_moist_mean = "Soil Moisture (%)",
    imp_mean = "Impervious Surface Cover (%)"))) +
  labs(title = "Distributions of Continuous Predictors")+
  theme_bw()

# raw ants vs. ground temperature
tmp_raw_ant_t2_plot <- ggplot(na.omit(env_analysis), aes(x = t2_mean, y = individuals)) + 
  geom_point() +
  geom_smooth(method = "loess", se = FALSE, color = "lightblue") +
  labs(title = "Raw ant count per species \nvs. Ground temperature", 
       x = "Ground temperature (\u00B0C)", y = "Raw number of ants per species")+
  theme_bw()

# Araw nts vs. Soil Moisture
tmp_raw_ant_moist_plot <- ggplot(na.omit(env_analysis), aes(x = vol_moist_mean, y = individuals)) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE, color = "darkgreen") +
  labs(title = "Raw ant count per species \nvs. Soil moisture", 
       x = "Volumetric soil moisture (%)", y = "Raw number of ants per species")+
  theme_bw()

# raw Ants vs. Impervious Surface
tmp_raw_ant_imp_plot <- ggplot(na.omit(env_analysis), aes(x = imp_mean, y = individuals)) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE, color = "darkred") +
  labs(title = "Raw ant count per species \nvs. Impervious surface cover", 
       x = "Impervious surface cover (%)", y = "Raw number of ants per species")+
  theme_bw()

# combine into 1 plot
(tmp_raw_ant_contpred_plot <- tmp_raw_ant_t2_plot + tmp_raw_ant_moist_plot + 
    tmp_raw_ant_imp_plot + plot_layout(nrow = 1))  # Arrange in 1 row
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# scaled ants vs. ground temperature
tmp_scaled_ant_t2_plot <- ggplot(na.omit(env_analysis), aes(x = t2_mean, y = scaled_species_total)) + 
  geom_point() +
  geom_smooth(method = "loess", se = FALSE, color = "lightblue") +
  labs(title = "Scaled ant count per species \nvs. Ground temperature", 
       x = "Ground temperature (\u00B0C)", y = "Scaled number of ants per species")+
  theme_bw()

# scaled Ants vs. Soil Moisture
tmp_scaled_ant_moist_plot <- ggplot(na.omit(env_analysis), aes(x = vol_moist_mean, y = scaled_species_total)) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE, color = "darkgreen") +
  labs(title = "Scaled ant count per species \nvs. Soil moisture", 
       x = "Volumetric soil moisture (%)", y = "Scaled number of ants per species")+
  theme_bw()

# scaled Ants vs. Impervious Surface
tmp_scaled_ant_imp_plot <- ggplot(na.omit(env_analysis), aes(x = imp_mean, y = scaled_species_total)) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE, color = "darkred") +
  labs(title = "Scaled ant count per species \nvs. Impervious surface cover", 
       x = "Impervious surface cover (%)", y = "Scaled number of ants per species")+
  theme_bw()

# combine into 1 plot
(tmp_scaled_ant_contpred_plot <- tmp_scaled_ant_t2_plot + tmp_scaled_ant_moist_plot + 
    tmp_scaled_ant_imp_plot + plot_layout(nrow = 1))  # Arrange in 1 row
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# distribution of cloud cover across exp
ggplot(na.omit(cut_env_data), aes(x = factor(clouds, levels = c(
  "no clouds", "lightly cloudy", "very cloudy")))) +
  geom_bar(fill = "lightblue") +
  labs(x = "Cloud cover", y = "Number of experiments") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  theme_bw()

# distribution of wind across exp
ggplot(na.omit(cut_env_data), aes(x = factor(wind, levels = c(
  "no wind", "weak wind", "strong wind")))) +
  geom_bar(fill = "lightblue") +
  labs(x = "Wind", y = "Number of experiments") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  theme_bw()

# distribution of shade across exp
ggplot(na.omit(cut_env_data), aes(x = factor(shade, levels = c(
  "no shade", "half-shade", "full shade")))) +
  geom_bar(fill = "lightblue") +
  labs(x = "Shade", y = "Number of experiments") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  theme_bw()

# distribution of rain across exp
ggplot(na.omit(cut_env_data), aes(x = factor(rain))) +
  geom_bar(fill = "lightblue") +
  labs(x = "Rain", y = "Number of experiments") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  theme_bw()

# scaled ant count vs clouds
ggplot(na.omit(env_analysis), aes(x = clouds, y = scaled_species_total)) +
  geom_boxplot(fill = "lightblue") +
  labs(x = "Cloud cover", y = "Scaled number of ants per species") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# scaled ant count vs wind
ggplot(na.omit(env_analysis), aes(x = wind, y = scaled_species_total)) +
  geom_boxplot(fill = "lightblue") +
  labs(x = "Wind", y = "Scaled number of ants per species") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# scaled ant count vs shade
ggplot(na.omit(env_analysis), aes(x = shade, y = scaled_species_total)) +
  geom_boxplot(fill = "lightblue") +
  labs(x = "Shade", y = "Scaled number of ants per species") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# distribution of experiment date
ggplot(na.omit(env_analysis), aes(x = relative_day)) +
  geom_histogram(bins = 25, fill = "skyblue", color = "black") +
  labs(x = "Relative date of experiment", y = "Frequency")+
  theme(plot.title = element_text(hjust = 0.5))+
  theme_bw()

# distribution of experiment start time
ggplot(na.omit(env_analysis), aes(x = minutes_since_8am)) +
  geom_histogram(bins = 25, fill = "skyblue", color = "black") +
  labs(x = "Minutes since 8 AM", y = "Frequency")+
  theme(plot.title = element_text(hjust = 0.5))+
  theme_bw()

# raw Ants vs. relative day
tmp_raw_ant_day_plot <- ggplot(na.omit(env_analysis), aes(x = relative_day, y = individuals)) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE, color = "darkred") +
  labs(title = "Raw ant count per species \nvs. Relative date", 
       x = "Relative date", y = "Raw number of ants per species")+
  theme_bw()

# raw Ants vs. time of day
tmp_raw_ant_time_plot <- ggplot(na.omit(env_analysis), aes(x = minutes_since_8am, y = individuals)) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE, color = "darkgreen") +
  labs(title = "Raw ant count per species \nvs. Time of day", 
       x = "Minutes since 8 AM", y = "Raw number of ants per species")+
  theme_bw()

# combine into 1 plot
(tmp_raw_ant_timedate_plot <- tmp_raw_ant_day_plot + tmp_raw_ant_time_plot + plot_layout(nrow = 1))  # Arrange in 1 row
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# scaled Ants vs. relative day
tmp_scaled_ant_day_plot <- ggplot(na.omit(env_analysis), aes(x = relative_day, y = individuals)) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE, color = "darkred") +
  labs(title = "Scaled ant count per species \nvs. Relative date", 
       x = "Relative date", y = "Scaled number of ants per species")+
  theme_bw()

# scaled Ants vs. time of day
tmp_scaled_ant_time_plot <- ggplot(na.omit(env_analysis), aes(x = minutes_since_8am, y = individuals)) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE, color = "darkgreen") +
  labs(title = "Scaled ant count per species \nvs. Time of day", 
       x = "Minutes since 8 AM", y = "Scaled number of ants per species")+
  theme_bw()
# combine into 1 plot
(tmp_scaled_ant_timedate_plot <- tmp_scaled_ant_day_plot + tmp_scaled_ant_time_plot + 
    plot_layout(nrow = 1))  # Arrange in 1 row
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

It is apparent that since my data contains a very high amount of zero values, the plots of the raw and scaled ant counts are not very informative in terms of trends in the data.

1.2.3 Visualizing random effects

I plot the distribution of the amount of ants found per site.

# create df for plots
tmp_site_ind <- left_join(cut_env_data, sp_totals, by = "sample_ID")
tmp_site_counts <- cut_env_data %>%
  count(site) %>%  # Count experiments per site
  left_join(select(cut_env_data, site, city), by = "site") %>% 
  distinct(site, .keep_all = TRUE)  # Keep only one row per site

# distribution of sites
ggplot(na.omit(tmp_site_counts), aes(x = factor(site, levels = unique(env_analysis$site)), y = n, fill = city)) +
  geom_bar(stat = "identity") +  
  scale_fill_paletteer_d("nationalparkcolors::Acadia") +  # Apply Acadia discrete color scale
  coord_flip() +  # Flips the axes for better readability
  labs(title = "Number of Experiments per Site",
       x = "Site", y = "Number of Experiments") +
  theme_classic2() +
  theme(legend.text = element_text(size = 10),  # set legend text
        legend.title = element_text(size = 12, hjust = 0.5, vjust = 2),  # Smaller legend title
        legend.key.size = unit(0.4, "cm"),  # Makes the legend squares smaller
        axis.title.x = element_text(margin = margin(r = 10), size = 12), # adjust distance from axis title to axis, set text size
        axis.title.y = element_text(margin = margin(r = 10), size = 12)) # adjust distance from axis title to axis, center title

# number of ants per site
(tmp_ind_site_plot <- ggplot(na.omit(tmp_site_ind), aes(x=site, y=total_ants_sp, fill = site))+  
    geom_col()+ 
    scale_fill_viridis_d()+
    labs(y = "Raw number of ants") +
    theme_classic()+
    theme(axis.text.x = element_text(angle = 45, hjust=1, size = 8),
          legend.position = "none",
          axis.title.x = element_blank(),  # adjust distance from axis title to axis
          axis.title.y = element_text(margin = margin(r = 10), hjust = 0.5)) # adjust distance from axis title to axis, center title
)

rm(list = ls(pattern = "^tmp_"))

1.3 Model choice and model assumptions

I am working with count data and multiple explanatory variables (both quantitative and categorical), as well as random effects. Additionally, my scaling process for the response variable at species level has added a lot of structural zeros to my data, since for each experiment there are many species which were not found. Thus, a zero-inflated model is necessary.

I try to fit a zero-inflated binomial (ZIB) GLMM with the number of ants of each species on each experiment representing a “success” and the maximum number of ants for each species across experiments - the number of ants of each species on each experiment as a “failure”. I also try to fit a zero-inflated Poisson (ZIP) GLMM with the number of ants of each species on each experiment as the response variable, and the log of the maximum number of ants of each species as an offset.

I first fit a base model with all of the predictor variables described in section 1.1. I fit both a ZIB GLMM and a ZIP GLMM and compare model performance and diagnostic checks to choose the best fitting model.

1.3.1 Model choice and model assumptions

Zero-Inflated Binomial GLMM

# zero-inflated binomial model
tmp_model1_zib <- glmmTMB(cbind(individuals, species_max - individuals) ~ 
                          t2_mean + vol_moist_mean + imp_mean + 
                          minutes_since_8am + relative_day + 
                          clouds_numeric + wind_numeric + shade_numeric + 
                          (1 | site) + (1 | sp_abbr),   # random effects
                          data = env_analysis,
                          ziformula = ~ 1,  # Zero-inflation
                          family = binomial)  # Binomial distribution 

# show anova table, ensure the model has converged without issues
Anova(tmp_model1_zib, type = "II")
## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: cbind(individuals, species_max - individuals)
##                     Chisq Df Pr(>Chisq)    
## t2_mean            17.609  1  2.713e-05 ***
## vol_moist_mean     84.427  1  < 2.2e-16 ***
## imp_mean           95.417  1  < 2.2e-16 ***
## minutes_since_8am 189.393  1  < 2.2e-16 ***
## relative_day       29.551  1  5.445e-08 ***
## clouds_numeric    202.968  1  < 2.2e-16 ***
## wind_numeric      171.182  1  < 2.2e-16 ***
## shade_numeric     355.587  1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Check for overdispersion, multicollinearity with performance package
check_overdispersion(tmp_model1_zib)  # overdispersion
## # Overdispersion test
## 
##  dispersion ratio = 0.656
##           p-value = 0.576
## No overdispersion detected.
check_collinearity(tmp_model1_zib)    # multicollinearity
## # Check for Multicollinearity
## 
## * conditional component:
## 
## Low Correlation
## 
##               Term  VIF   VIF 95% CI Increased SE Tolerance Tolerance 95% CI
##            t2_mean 2.29 [2.20, 2.40]         1.51      0.44     [0.42, 0.46]
##     vol_moist_mean 1.23 [1.19, 1.28]         1.11      0.81     [0.78, 0.84]
##           imp_mean 1.08 [1.05, 1.12]         1.04      0.93     [0.89, 0.95]
##  minutes_since_8am 1.38 [1.34, 1.44]         1.18      0.72     [0.70, 0.75]
##       relative_day 1.93 [1.85, 2.01]         1.39      0.52     [0.50, 0.54]
##     clouds_numeric 1.48 [1.43, 1.54]         1.22      0.68     [0.65, 0.70]
##       wind_numeric 1.53 [1.47, 1.59]         1.24      0.66     [0.63, 0.68]
##      shade_numeric 1.43 [1.38, 1.48]         1.19      0.70     [0.68, 0.73]
# check models assumptions and residuals with performance package
check_model(tmp_model1_zib, verbose = T)
## `check_outliers()` does not yet support models of class `glmmTMB`.
## Using `ci_type = "gaussian"` because model is not bernoulli.

# simulate residuals and check model assumptions with DHARMa package
tmp_sim_model1 <- simulateResiduals(fittedModel = tmp_model1_zib)
plot(tmp_sim_model1)

# Compute fitted values and Pearson residuals
tmp_fitted_vals <- fitted(tmp_model1_zib)
tmp_residuals <- residuals(tmp_model1_zib, type = "pearson")
# Create binned residuals plot
arm::binnedplot(tmp_fitted_vals, tmp_residuals)

The ZIB-GLMM shows no pattern in the residuals plot from the DHARMa package and the binned plot from the arm package shows no clear trend, although the binned residuals plot from the performance package seems to show a slight downwards trend. There is no sign of overdispersion or high collinearity.

Zero-Inflated Poisson GLMM

# zero-inflated Poisson model 
tmp_model1_zip <- glmmTMB(individuals ~ t2_mean + vol_moist_mean + imp_mean + 
                           minutes_since_8am + relative_day + 
                           clouds_numeric + wind_numeric + shade_numeric + 
                           (1 | site) + (1 | sp_abbr),  # random effects
                      offset = log(species_max),   
                      ziformula = ~1,      # zero-inflation
                      family = poisson,   # poisson distribution
                      data = na.omit(env_analysis))
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
# show anova table, ensure model converged without issues
Anova(tmp_model1_zip, type = "II")
## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: individuals
##                      Chisq Df Pr(>Chisq)    
## t2_mean             2.5383  1     0.1111    
## vol_moist_mean    120.8991  1  < 2.2e-16 ***
## imp_mean           84.5972  1  < 2.2e-16 ***
## minutes_since_8am 125.0182  1  < 2.2e-16 ***
## relative_day       35.1442  1  3.062e-09 ***
## clouds_numeric     79.8822  1  < 2.2e-16 ***
## wind_numeric       48.0360  1  4.185e-12 ***
## shade_numeric     196.3327  1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Check for overdispersion, multicollinearity with performance package
check_overdispersion(tmp_model1_zip)  # overdispersion
## # Overdispersion test
## 
##  dispersion ratio = 0.000
##           p-value = 0.088
## No overdispersion detected.
check_collinearity(tmp_model1_zip)    # multicollinearity
## # Check for Multicollinearity
## 
## * conditional component:
## 
## Low Correlation
## 
##               Term  VIF   VIF 95% CI Increased SE Tolerance Tolerance 95% CI
##            t2_mean 2.36 [2.26, 2.48]         1.54      0.42     [0.40, 0.44]
##     vol_moist_mean 1.24 [1.20, 1.29]         1.12      0.80     [0.77, 0.83]
##           imp_mean 1.06 [1.04, 1.11]         1.03      0.94     [0.90, 0.96]
##  minutes_since_8am 1.38 [1.33, 1.43]         1.17      0.73     [0.70, 0.75]
##       relative_day 2.14 [2.05, 2.24]         1.46      0.47     [0.45, 0.49]
##     clouds_numeric 1.38 [1.33, 1.44]         1.18      0.72     [0.70, 0.75]
##       wind_numeric 1.58 [1.52, 1.65]         1.26      0.63     [0.61, 0.66]
##      shade_numeric 1.44 [1.39, 1.50]         1.20      0.70     [0.67, 0.72]
# check models assumptions and residuals with performance package
check_model(tmp_model1_zip, verbose = T)
## `check_outliers()` does not yet support models of class `glmmTMB`.

# Compute fitted values and Pearson residuals
tmp_fitted_vals <- fitted(tmp_model1_zip)
tmp_residuals <- residuals(tmp_model1_zip, type = "pearson")
# Create binned residuals plot
arm::binnedplot(tmp_fitted_vals, tmp_residuals)

# simulate residuals and check model assumptions with DHARMa package
tmp_sim_model1 <- simulateResiduals(fittedModel = tmp_model1_zip)
plot(tmp_sim_model1)
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details

The ZIP-GLMM shows no pattern in the residuals plot from the DHARMa package and the binned plot from the arm package shows no clear trend. The plots and tests from the performance package show no sign of overdispersion or high collinearity.

compare_performance(tmp_model1_zib, tmp_model1_zip)
## When comparing models, please note that probably not all models were fit
##   from same data.
## # Comparison of Model Performance Indices
## 
## Name           |   Model |  AIC (weights) | AICc (weights) |  BIC (weights)
## ---------------------------------------------------------------------------
## tmp_model1_zib | glmmTMB | 9904.4 (<.001) | 9904.5 (<.001) | 9982.4 (<.001)
## tmp_model1_zip | glmmTMB | 6526.9 (>.999) | 6527.0 (>.999) | 6603.6 (>.999)
## 
## Name           | R2 (cond.) | R2 (marg.) |   ICC |   RMSE | Sigma | Log_loss
## ----------------------------------------------------------------------------
## tmp_model1_zib |      0.997 |      0.234 | 0.996 |  0.089 | 1.000 |         
## tmp_model1_zip |      0.565 |      0.133 | 0.498 | 14.683 | 1.000 |         
## 
## Name           | Score_log | Score_spherical
## --------------------------------------------
## tmp_model1_zib |           |                
## tmp_model1_zip |    -0.705 |           0.015

Comparing the model fit of both models shows that the ZIP-GLMM performs much better with a significantly lower AIC, AICc, and BIC. the ZIB-GLMM explains more of the overall variance (both higher conditional R2 and marginal R2), but the ZIB model has an extremely high ICC (0.996), suggesting nearly all of the variance is explained by the random effects. The ZIP model has a lower explanatory power but much lower ICC (0.498), meaning the fixed effects contribute more to the explanation of overall variance. The ZIP model has higher model error (RMSE = 14.683), meaning the predictions are less precise.

Overall, the zero-inflated Poisson GLMM performs better than the zero-inflated binomial GLMM. The rest of the analysis will be conducted with the ZIP-GLMM.

Categorical variables

Now that I have fit the base model, I can test whether the model performs better with the categorical variables as factors or numerically encoded. I fit a model with the categorical variables as factors instead of numerically encoded variables and compare the performance with the base model.

# fit ZIP with categorical predictors as factors
tmp_model1_cat <- glmmTMB(individuals ~ t2_mean + vol_moist_mean + imp_mean + 
                           minutes_since_8am + relative_day + 
                           clouds + wind + shade +     # categorical predictors as factors
                           (1 | site) + (1 | sp_abbr),  # random effects
                      offset = log(species_max),   
                      ziformula = ~1,      # zero-inflation
                      family = poisson,   # poisson distribution
                      data = na.omit(env_analysis))
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
# show summary table to ensure the model has converged (no warnings)
Anova(tmp_model1_cat, type = "II")
## Warning in printHypothesis(L, rhs, names(b)): one or more coefficients in the hypothesis include
##      arithmetic operators in their names;
##   the printed representation of the hypothesis will be omitted
## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: individuals
##                      Chisq Df Pr(>Chisq)    
## t2_mean             1.1605  1   0.281354    
## vol_moist_mean      6.8311  1   0.008958 ** 
## imp_mean           76.4498  1  < 2.2e-16 ***
## minutes_since_8am 308.2892  1  < 2.2e-16 ***
## relative_day      158.7223  1  < 2.2e-16 ***
## clouds            394.1585  2  < 2.2e-16 ***
## wind              521.0222  2  < 2.2e-16 ***
## shade             401.0190  2  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(tmp_model1_cat)
##  Family: poisson  ( log )
## Formula:          
## individuals ~ t2_mean + vol_moist_mean + imp_mean + minutes_since_8am +  
##     relative_day + clouds + wind + shade + (1 | site) + (1 |      sp_abbr)
## Zero inflation:               ~1
## Data: na.omit(env_analysis)
##  Offset: log(species_max)
## 
##      AIC      BIC   logLik deviance df.resid 
##   5553.5   5649.4  -2761.7   5523.5     4413 
## 
## Random effects:
## 
## Conditional model:
##  Groups  Name        Variance Std.Dev.
##  site    (Intercept) 7.653    2.766   
##  sp_abbr (Intercept) 2.086    1.444   
## Number of obs: 4428, groups:  site, 33; sp_abbr, 27
## 
## Conditional model:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           1.2120971  0.6632140   1.828  0.06761 .  
## t2_mean               0.0089264  0.0082861   1.077  0.28135    
## vol_moist_mean        0.0098764  0.0037788   2.614  0.00896 ** 
## imp_mean             -0.0421326  0.0048187  -8.744  < 2e-16 ***
## minutes_since_8am     0.0034950  0.0001991  17.558  < 2e-16 ***
## relative_day         -0.0219700  0.0017439 -12.599  < 2e-16 ***
## cloudslightly cloudy -1.9000674  0.0969210 -19.604  < 2e-16 ***
## cloudsvery cloudy    -1.9791487  0.1072728 -18.450  < 2e-16 ***
## windweak wind        -1.0008130  0.0841000 -11.900  < 2e-16 ***
## windstrong wind       0.3209682  0.1034862   3.102  0.00193 ** 
## shadehalf-shade      -1.2866773  0.0658158 -19.550  < 2e-16 ***
## shadefull shade      -1.2112848  0.0829355 -14.605  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Zero-inflation model:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.35310    0.08306   28.33   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Check for overdispersion with performance package
check_overdispersion(tmp_model1_cat)  
## # Overdispersion test
## 
##  dispersion ratio = 0.000
##           p-value = 0.016
## Underdispersion detected.
# check models assumptions and residuals with performance package
check_model(tmp_model1_cat, verbose = T)
## `check_outliers()` does not yet support models of class `glmmTMB`.

# Compute fitted values and Pearson residuals
tmp_fitted_vals <- fitted(tmp_model1_cat)
tmp_residuals <- residuals(tmp_model1_cat, type = "pearson")
# Create binned residuals plot
arm::binnedplot(tmp_fitted_vals, tmp_residuals)

# simulate residuals and check model assumptions with DHARMa package
tmp_sim_model1 <- simulateResiduals(fittedModel = tmp_model1_cat)
plot(tmp_sim_model1)

# compare performance of base model and model with categorical var as factors
compare_performance(tmp_model1_zip, tmp_model1_cat) # base model does better
## # Comparison of Model Performance Indices
## 
## Name           |   Model |  AIC (weights) | AICc (weights) |  BIC (weights)
## ---------------------------------------------------------------------------
## tmp_model1_zip | glmmTMB | 6526.9 (<.001) | 6527.0 (<.001) | 6603.6 (<.001)
## tmp_model1_cat | glmmTMB | 5553.5 (>.999) | 5553.6 (>.999) | 5649.4 (>.999)
## 
## Name           | R2 (cond.) | R2 (marg.) |   ICC |   RMSE | Sigma | Score_log | Score_spherical
## -----------------------------------------------------------------------------------------------
## tmp_model1_zip |      0.565 |      0.133 | 0.498 | 14.683 | 1.000 |    -0.705 |           0.015
## tmp_model1_cat |      0.521 |      0.135 | 0.446 | 97.291 | 1.000 |    -0.591 |           0.015

The ZIP model with the categorical variables as factors shows no pattern in the residuals plot from the DHARMa package and the Q-Q plot is well aligned with the 1:1 line. The binned plot from the arm package shows no clear trend. The plots and tests from the performance package show no sign high collinearity, but a slightly significant underdispersion (p = 0.016). Since the Q-Q plot and residuals plots look fine and the overdispersion test is not highly significant, this should not be a large issue.

Comparing model performance show that the categorical model performs much better in terms of AIC, AICc and BIC than the base model. I continue the analysis with the categorical model.

Interactions

It is possible that some interactions are at play, so I try different model configurations with interactions.

  • Interaction between temperature and impervious surface cover: the urban heat island effect is a well-documented process happening in urban areas, so it is possible that the effect of temperature on ant foraging depends on urbanization, characterized here as impervious surface cover.
  • Interaction between temperature and date: date of experiment is an indirect way of looking at seasonal variation, and temperature fluctuates seasonally. It is possible that species might respond to temperature differently depending on the time of year, e.g. more active when it is warmer in spring but less active when it is too warm in July to avoid heat stress.
  • Interaction between temperature and time of day: similar idea to date, where there are daily variations in temperature. Temperature may influence ant foraging differently throughout the day, e.g. positive influence in the morning as temperature rises but negative influence at the hottest times of the day due to heat stress.
# model interaction between temperature and impervious surface 
tmp_model1_int_imp <- glmmTMB(individuals ~ t2_mean * imp_mean + vol_moist_mean + 
                              minutes_since_8am + relative_day +   
                              clouds + wind + shade +
                              (1 | site) + (1 | sp_abbr),
                      offset = log(species_max),   
                      ziformula = ~1,      # zero-inflation
                      family = poisson,   # poisson distribution
                      data = na.omit(env_analysis))
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
check_collinearity(tmp_model1_int_imp)
## Model has interaction terms. VIFs might be inflated.
##   Try to center the variables used for the interaction, or check
##   multicollinearity among predictors of a model without interaction terms.
## # Check for Multicollinearity
## 
## * conditional component:
## 
## Low Correlation
## 
##               Term  VIF     VIF 95% CI Increased SE Tolerance Tolerance 95% CI
##           imp_mean 3.68 [ 3.50,  3.87]         1.92      0.27     [0.26, 0.29]
##     vol_moist_mean 1.45 [ 1.40,  1.51]         1.20      0.69     [0.66, 0.72]
##  minutes_since_8am 3.99 [ 3.79,  4.20]         2.00      0.25     [0.24, 0.26]
##       relative_day 2.76 [ 2.63,  2.90]         1.66      0.36     [0.35, 0.38]
##             clouds 2.95 [ 2.81,  3.10]         1.72      0.34     [0.32, 0.36]
##               wind 2.86 [ 2.73,  3.01]         1.69      0.35     [0.33, 0.37]
##              shade 2.15 [ 2.06,  2.25]         1.47      0.47     [0.44, 0.49]
## 
## Moderate Correlation
## 
##     Term  VIF     VIF 95% CI Increased SE Tolerance Tolerance 95% CI
##  t2_mean 8.06 [ 7.63,  8.51]         2.84      0.12     [0.12, 0.13]
## 
## High Correlation
## 
##              Term   VIF     VIF 95% CI Increased SE Tolerance Tolerance 95% CI
##  t2_mean:imp_mean 11.21 [10.60, 11.86]         3.35      0.09     [0.08, 0.09]
# model interaction between temperature and date 
tmp_model1_int_date <- glmmTMB(individuals ~ t2_mean * relative_day + vol_moist_mean + 
                              minutes_since_8am + imp_mean +  
                              clouds + wind + shade +
                              (1 | site) + (1 | sp_abbr),
                      offset = log(species_max),   
                      ziformula = ~1,      # zero-inflation
                      family = poisson,   # poisson distribution
                      data = na.omit(env_analysis))
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
check_collinearity(tmp_model1_int_date)
## Model has interaction terms. VIFs might be inflated.
##   Try to center the variables used for the interaction, or check
##   multicollinearity among predictors of a model without interaction terms.
## # Check for Multicollinearity
## 
## * conditional component:
## 
## Low Correlation
## 
##               Term  VIF       VIF 95% CI Increased SE Tolerance
##     vol_moist_mean 2.18 [  2.09,   2.28]         1.48      0.46
##  minutes_since_8am 2.07 [  1.98,   2.16]         1.44      0.48
##           imp_mean 1.19 [  1.16,   1.24]         1.09      0.84
##             clouds 4.70 [  4.46,   4.96]         2.17      0.21
##               wind 3.79 [  3.60,   3.99]         1.95      0.26
##              shade 2.28 [  2.18,   2.38]         1.51      0.44
##  Tolerance 95% CI
##      [0.44, 0.48]
##      [0.46, 0.50]
##      [0.81, 0.87]
##      [0.20, 0.22]
##      [0.25, 0.28]
##      [0.42, 0.46]
## 
## High Correlation
## 
##                  Term    VIF       VIF 95% CI Increased SE Tolerance
##               t2_mean  51.98 [ 49.05,  55.10]         7.21      0.02
##          relative_day 183.19 [172.78, 194.23]        13.53  5.46e-03
##  t2_mean:relative_day 349.97 [330.05, 371.09]        18.71  2.86e-03
##  Tolerance 95% CI
##      [0.02, 0.02]
##      [0.01, 0.01]
##      [0.00, 0.00]
# model interaction between temperature and time of day 
tmp_model1_int_time <- glmmTMB(individuals ~ t2_mean * minutes_since_8am + vol_moist_mean + 
                              relative_day + imp_mean +   
                              clouds + wind + shade +
                              (1 | site) + (1 | sp_abbr),
                      offset = log(species_max),   
                      ziformula = ~1,      # zero-inflation
                      family = poisson,   # poisson distribution
                      data = na.omit(env_analysis))
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
check_collinearity(tmp_model1_int_time)
## Model has interaction terms. VIFs might be inflated.
##   Try to center the variables used for the interaction, or check
##   multicollinearity among predictors of a model without interaction terms.
## # Check for Multicollinearity
## 
## * conditional component:
## 
## Low Correlation
## 
##            Term  VIF       VIF 95% CI Increased SE Tolerance Tolerance 95% CI
##  vol_moist_mean 1.52 [  1.47,   1.59]         1.23      0.66     [0.63, 0.68]
##    relative_day 2.82 [  2.69,   2.96]         1.68      0.35     [0.34, 0.37]
##        imp_mean 1.13 [  1.10,   1.17]         1.06      0.88     [0.85, 0.91]
##          clouds 3.62 [  3.44,   3.80]         1.90      0.28     [0.26, 0.29]
##            wind 2.90 [  2.76,   3.05]         1.70      0.34     [0.33, 0.36]
##           shade 2.18 [  2.08,   2.28]         1.48      0.46     [0.44, 0.48]
## 
## High Correlation
## 
##                       Term    VIF       VIF 95% CI Increased SE Tolerance
##                    t2_mean  16.10 [ 15.21,  17.05]         4.01      0.06
##          minutes_since_8am  83.75 [ 79.00,  88.78]         9.15      0.01
##  t2_mean:minutes_since_8am 106.39 [100.36, 112.80]        10.31  9.40e-03
##  Tolerance 95% CI
##      [0.06, 0.07]
##      [0.01, 0.01]
##      [0.01, 0.01]
# model all 3 interactions  
tmp_model1_int_all <- glmmTMB(individuals ~ t2_mean * minutes_since_8am + t2_mean * relative_day +
                                t2_mean * imp_mean + vol_moist_mean + 
                                clouds + wind + shade +
                                (1 | site) + (1 | sp_abbr),
                              offset = log(species_max),   
                              ziformula = ~1,      # zero-inflation
                              family = poisson,   # poisson distribution
                              data = na.omit(env_analysis))
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
check_collinearity(tmp_model1_int_all)
## Model has interaction terms. VIFs might be inflated.
##   Try to center the variables used for the interaction, or check
##   multicollinearity among predictors of a model without interaction terms.
## # Check for Multicollinearity
## 
## * conditional component:
## 
## Low Correlation
## 
##            Term  VIF       VIF 95% CI Increased SE Tolerance Tolerance 95% CI
##        imp_mean 3.75 [  3.56,   3.94]         1.94      0.27     [0.25, 0.28]
##  vol_moist_mean 1.67 [  1.60,   1.74]         1.29      0.60     [0.57, 0.62]
##          clouds 4.46 [  4.24,   4.70]         2.11      0.22     [0.21, 0.24]
##            wind 3.19 [  3.04,   3.35]         1.79      0.31     [0.30, 0.33]
##           shade 2.33 [  2.23,   2.44]         1.53      0.43     [0.41, 0.45]
## 
## High Correlation
## 
##                       Term    VIF       VIF 95% CI Increased SE Tolerance
##                    t2_mean  38.88 [ 36.70,  41.20]         6.24      0.03
##          minutes_since_8am  98.99 [ 93.38, 104.94]         9.95      0.01
##               relative_day  86.07 [ 81.20,  91.25]         9.28      0.01
##  t2_mean:minutes_since_8am 119.53 [112.75, 126.72]        10.93  8.37e-03
##       t2_mean:relative_day 170.73 [161.03, 181.01]        13.07  5.86e-03
##           t2_mean:imp_mean  10.78 [ 10.19,  11.40]         3.28      0.09
##  Tolerance 95% CI
##      [0.02, 0.03]
##      [0.01, 0.01]
##      [0.01, 0.01]
##      [0.01, 0.01]
##      [0.01, 0.01]
##      [0.09, 0.10]
# model both imperivous surface and date interactions  
tmp_model1_int_impday <- glmmTMB(individuals ~ t2_mean * relative_day +
                                    t2_mean * imp_mean + vol_moist_mean + 
                              clouds + wind + shade +
                              (1 | site) + (1 | sp_abbr),
                      offset = log(species_max),   
                      ziformula = ~1,      # zero-inflation
                      family = poisson,   # poisson distribution
                      data = na.omit(env_analysis))
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
check_collinearity(tmp_model1_int_impday)
## Model has interaction terms. VIFs might be inflated.
##   Try to center the variables used for the interaction, or check
##   multicollinearity among predictors of a model without interaction terms.
## # Check for Multicollinearity
## 
## * conditional component:
## 
## Low Correlation
## 
##            Term  VIF       VIF 95% CI Increased SE Tolerance Tolerance 95% CI
##        imp_mean 2.62 [  2.50,   2.75]         1.62      0.38     [0.36, 0.40]
##  vol_moist_mean 1.79 [  1.72,   1.87]         1.34      0.56     [0.53, 0.58]
##          clouds 3.52 [  3.35,   3.71]         1.88      0.28     [0.27, 0.30]
##            wind 3.54 [  3.37,   3.73]         1.88      0.28     [0.27, 0.30]
##           shade 2.11 [  2.02,   2.21]         1.45      0.47     [0.45, 0.49]
## 
## Moderate Correlation
## 
##              Term  VIF       VIF 95% CI Increased SE Tolerance Tolerance 95% CI
##  t2_mean:imp_mean 6.05 [  5.73,   6.38]         2.46      0.17     [0.16, 0.17]
## 
## High Correlation
## 
##                  Term    VIF       VIF 95% CI Increased SE Tolerance
##               t2_mean  27.34 [ 25.81,  28.96]         5.23      0.04
##          relative_day 124.18 [117.13, 131.66]        11.14  8.05e-03
##  t2_mean:relative_day 214.41 [202.22, 227.34]        14.64  4.66e-03
##  Tolerance 95% CI
##      [0.03, 0.04]
##      [0.01, 0.01]
##      [0.00, 0.00]
# compare performance of base model and interactions
compare_performance(tmp_model1_cat, tmp_model1_int_imp, tmp_model1_int_date, tmp_model1_int_time, tmp_model1_int_all, tmp_model1_int_impday) 
## # Comparison of Model Performance Indices
## 
## Name                  |   Model |  AIC (weights) | AICc (weights)
## -----------------------------------------------------------------
## tmp_model1_cat        | glmmTMB | 5553.5 (<.001) | 5553.6 (<.001)
## tmp_model1_int_imp    | glmmTMB | 5512.3 (<.001) | 5512.4 (<.001)
## tmp_model1_int_date   | glmmTMB | 5368.2 (<.001) | 5368.3 (<.001)
## tmp_model1_int_time   | glmmTMB | 5555.5 (<.001) | 5555.6 (<.001)
## tmp_model1_int_all    | glmmTMB | 5352.3 (>.999) | 5352.5 (>.999)
## tmp_model1_int_impday | glmmTMB | 5470.6 (<.001) | 5470.7 (<.001)
## 
## Name                  |  BIC (weights) | R2 (cond.) | R2 (marg.) |   ICC
## ------------------------------------------------------------------------
## tmp_model1_cat        | 5649.4 (<.001) |      0.521 |      0.135 | 0.446
## tmp_model1_int_imp    | 5614.6 (<.001) |      0.530 |      0.133 | 0.458
## tmp_model1_int_date   | 5470.5 (0.176) |      0.664 |      0.186 | 0.587
## tmp_model1_int_time   | 5657.8 (<.001) |      0.521 |      0.135 | 0.446
## tmp_model1_int_all    | 5467.4 (0.824) |      0.639 |      0.197 | 0.550
## tmp_model1_int_impday | 5572.9 (<.001) |      0.606 |      0.155 | 0.534
## 
## Name                  |   RMSE | Sigma | Score_log | Score_spherical
## --------------------------------------------------------------------
## tmp_model1_cat        | 97.291 | 1.000 |    -0.591 |           0.015
## tmp_model1_int_imp    | 37.105 | 1.000 |    -0.587 |           0.015
## tmp_model1_int_date   | 16.454 | 1.000 |    -0.571 |           0.015
## tmp_model1_int_time   | 96.156 | 1.000 |    -0.591 |           0.015
## tmp_model1_int_all    | 16.016 | 1.000 |    -0.570 |           0.015
## tmp_model1_int_impday | 15.130 | 1.000 |    -0.584 |           0.015

Comparing model performance shows that the model including all 3 interaction terms performs significantly better than all other models. However, including interaction terms highly increases multicollinearity and all models show variables with VIFs > 10, which is problematic.

I first try to center all variables included in interaction terms around the variable means and compare the model fits again.

# centering the variables around the mean (centered var = var - mean(var))
env_analysis <- env_analysis %>%
  mutate(
    cent_t2 = scale(t2_mean, center = TRUE, scale = FALSE),  
    cent_imp = scale(imp_mean, center = TRUE, scale = FALSE),
    cent_minutes_since_8am = scale(minutes_since_8am, center = TRUE, scale = FALSE),
    cent_relative_day = scale(relative_day, center = TRUE, scale = FALSE))
# model interaction between temperature and impervious surface 
tmp_model1_int_imp_c <- glmmTMB(individuals ~ cent_t2 * cent_imp + vol_moist_mean + 
                              minutes_since_8am + relative_day +   
                              clouds + wind + shade +
                              (1 | site) + (1 | sp_abbr),
                      offset = log(species_max),   
                      ziformula = ~1,      # zero-inflation
                      family = poisson,   # poisson distribution
                      data = na.omit(env_analysis))
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
check_collinearity(tmp_model1_int_imp_c)
## # Check for Multicollinearity
## 
## * conditional component:
## 
## Low Correlation
## 
##               Term  VIF   VIF 95% CI Increased SE Tolerance Tolerance 95% CI
##            cent_t2 2.43 [2.32, 2.55]         1.56      0.41     [0.39, 0.43]
##           cent_imp 1.12 [1.09, 1.17]         1.06      0.89     [0.86, 0.92]
##     vol_moist_mean 1.45 [1.40, 1.51]         1.21      0.69     [0.66, 0.71]
##  minutes_since_8am 3.94 [3.75, 4.15]         1.99      0.25     [0.24, 0.27]
##       relative_day 2.76 [2.63, 2.89]         1.66      0.36     [0.35, 0.38]
##             clouds 2.94 [2.81, 3.09]         1.72      0.34     [0.32, 0.36]
##               wind 2.86 [2.73, 3.01]         1.69      0.35     [0.33, 0.37]
##              shade 2.15 [2.06, 2.25]         1.47      0.47     [0.44, 0.49]
##   cent_t2:cent_imp 3.09 [2.94, 3.25]         1.76      0.32     [0.31, 0.34]
# model interaction between temperature and date 
tmp_model1_int_date_c <- glmmTMB(individuals ~ cent_t2 * cent_relative_day + vol_moist_mean + 
                              minutes_since_8am + imp_mean +  
                              clouds + wind + shade +
                              (1 | site) + (1 | sp_abbr),
                      offset = log(species_max),   
                      ziformula = ~1,      # zero-inflation
                      family = poisson,   # poisson distribution
                      data = na.omit(env_analysis))
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
check_collinearity(tmp_model1_int_date_c)
## # Check for Multicollinearity
## 
## * conditional component:
## 
## Low Correlation
## 
##                       Term  VIF   VIF 95% CI Increased SE Tolerance
##                    cent_t2 3.65 [3.47, 3.84]         1.91      0.27
##          cent_relative_day 3.33 [3.17, 3.50]         1.83      0.30
##             vol_moist_mean 1.65 [1.59, 1.72]         1.28      0.61
##          minutes_since_8am 2.05 [1.96, 2.14]         1.43      0.49
##                   imp_mean 1.16 [1.13, 1.20]         1.08      0.86
##                     clouds 3.29 [3.14, 3.46]         1.82      0.30
##                       wind 3.06 [2.92, 3.22]         1.75      0.33
##                      shade 2.26 [2.16, 2.37]         1.50      0.44
##  cent_t2:cent_relative_day 3.82 [3.63, 4.02]         1.95      0.26
##  Tolerance 95% CI
##      [0.26, 0.29]
##      [0.29, 0.32]
##      [0.58, 0.63]
##      [0.47, 0.51]
##      [0.83, 0.89]
##      [0.29, 0.32]
##      [0.31, 0.34]
##      [0.42, 0.46]
##      [0.25, 0.28]
# model interaction between temperature and time of day 
tmp_model1_int_time_c <- glmmTMB(individuals ~ cent_t2 * cent_minutes_since_8am + vol_moist_mean + 
                              relative_day + imp_mean +   
                              clouds + wind + shade +
                              (1 | site) + (1 | sp_abbr),
                      offset = log(species_max),   
                      ziformula = ~1,      # zero-inflation
                      family = poisson,   # poisson distribution
                      data = na.omit(env_analysis))
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
check_collinearity(tmp_model1_int_time_c)
## # Check for Multicollinearity
## 
## * conditional component:
## 
## Low Correlation
## 
##                            Term  VIF   VIF 95% CI Increased SE Tolerance
##                         cent_t2 4.21 [4.00, 4.44]         2.05      0.24
##          cent_minutes_since_8am 2.22 [2.13, 2.33]         1.49      0.45
##                  vol_moist_mean 1.51 [1.45, 1.57]         1.23      0.66
##                    relative_day 2.85 [2.72, 3.00]         1.69      0.35
##                        imp_mean 1.13 [1.10, 1.18]         1.06      0.88
##                          clouds 3.70 [3.52, 3.89]         1.92      0.27
##                            wind 2.90 [2.76, 3.05]         1.70      0.34
##                           shade 2.16 [2.07, 2.26]         1.47      0.46
##  cent_t2:cent_minutes_since_8am 3.78 [3.60, 3.98]         1.95      0.26
##  Tolerance 95% CI
##      [0.23, 0.25]
##      [0.43, 0.47]
##      [0.64, 0.69]
##      [0.33, 0.37]
##      [0.85, 0.91]
##      [0.26, 0.28]
##      [0.33, 0.36]
##      [0.44, 0.48]
##      [0.25, 0.28]
# model all 3 interactions  
tmp_model1_int_all_c <- glmmTMB(individuals ~ cent_t2 * cent_minutes_since_8am + cent_t2 * cent_relative_day +
                                cent_t2 * cent_imp + vol_moist_mean + 
                                clouds + wind + shade +
                                (1 | site) + (1 | sp_abbr),
                              offset = log(species_max),   
                              ziformula = ~1,      # zero-inflation
                              family = poisson,   # poisson distribution
                              data = na.omit(env_analysis))
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
check_collinearity(tmp_model1_int_all_c)
## # Check for Multicollinearity
## 
## * conditional component:
## 
## Low Correlation
## 
##                            Term  VIF   VIF 95% CI Increased SE Tolerance
##          cent_minutes_since_8am 4.09 [3.89, 4.31]         2.02      0.24
##               cent_relative_day 3.75 [3.57, 3.95]         1.94      0.27
##                        cent_imp 1.16 [1.13, 1.21]         1.08      0.86
##                  vol_moist_mean 1.66 [1.60, 1.74]         1.29      0.60
##                          clouds 4.54 [4.31, 4.78]         2.13      0.22
##                            wind 3.19 [3.04, 3.36]         1.79      0.31
##                           shade 2.33 [2.22, 2.44]         1.53      0.43
##  cent_t2:cent_minutes_since_8am 4.43 [4.21, 4.67]         2.10      0.23
##       cent_t2:cent_relative_day 4.02 [3.82, 4.23]         2.00      0.25
##                cent_t2:cent_imp 3.23 [3.07, 3.39]         1.80      0.31
##  Tolerance 95% CI
##      [0.23, 0.26]
##      [0.25, 0.28]
##      [0.83, 0.89]
##      [0.58, 0.63]
##      [0.21, 0.23]
##      [0.30, 0.33]
##      [0.41, 0.45]
##      [0.21, 0.24]
##      [0.24, 0.26]
##      [0.29, 0.33]
## 
## Moderate Correlation
## 
##     Term  VIF   VIF 95% CI Increased SE Tolerance Tolerance 95% CI
##  cent_t2 5.43 [5.15, 5.73]         2.33      0.18     [0.17, 0.19]
# model both significant interactions  
tmp_model1_int_impday_c <- glmmTMB(individuals ~ cent_t2 * cent_relative_day +
                                    cent_t2 * cent_imp + vol_moist_mean + 
                              clouds + wind + shade +
                              (1 | site) + (1 | sp_abbr),
                      offset = log(species_max),   
                      ziformula = ~1,      # zero-inflation
                      family = poisson,   # poisson distribution
                      data = na.omit(env_analysis))
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
check_collinearity(tmp_model1_int_impday_c)
## # Check for Multicollinearity
## 
## * conditional component:
## 
## Low Correlation
## 
##                       Term  VIF   VIF 95% CI Increased SE Tolerance
##                    cent_t2 3.49 [3.33, 3.68]         1.87      0.29
##          cent_relative_day 3.16 [3.00, 3.32]         1.78      0.32
##                   cent_imp 1.16 [1.12, 1.20]         1.08      0.86
##             vol_moist_mean 1.52 [1.46, 1.58]         1.23      0.66
##                     clouds 2.31 [2.21, 2.42]         1.52      0.43
##                       wind 2.78 [2.65, 2.92]         1.67      0.36
##                      shade 2.11 [2.02, 2.21]         1.45      0.47
##  cent_t2:cent_relative_day 3.66 [3.48, 3.85]         1.91      0.27
##           cent_t2:cent_imp 1.58 [1.52, 1.65]         1.26      0.63
##  Tolerance 95% CI
##      [0.27, 0.30]
##      [0.30, 0.33]
##      [0.83, 0.89]
##      [0.63, 0.68]
##      [0.41, 0.45]
##      [0.34, 0.38]
##      [0.45, 0.49]
##      [0.26, 0.29]
##      [0.61, 0.66]
# compare performance of base model and interactions
compare_performance(tmp_model1_cat, tmp_model1_int_imp_c, tmp_model1_int_date_c, tmp_model1_int_time_c, tmp_model1_int_all_c, tmp_model1_int_impday_c) 
## # Comparison of Model Performance Indices
## 
## Name                    |   Model |  AIC (weights) | AICc (weights)
## -------------------------------------------------------------------
## tmp_model1_cat          | glmmTMB | 5553.5 (<.001) | 5553.6 (<.001)
## tmp_model1_int_imp_c    | glmmTMB | 5512.3 (<.001) | 5512.4 (<.001)
## tmp_model1_int_date_c   | glmmTMB | 5368.2 (<.001) | 5368.3 (<.001)
## tmp_model1_int_time_c   | glmmTMB | 5555.5 (<.001) | 5555.6 (<.001)
## tmp_model1_int_all_c    | glmmTMB | 5352.3 (>.999) | 5352.5 (>.999)
## tmp_model1_int_impday_c | glmmTMB | 5470.6 (<.001) | 5470.7 (<.001)
## 
## Name                    |  BIC (weights) | R2 (cond.) | R2 (marg.) |   ICC
## --------------------------------------------------------------------------
## tmp_model1_cat          | 5649.4 (<.001) |      0.521 |      0.135 | 0.446
## tmp_model1_int_imp_c    | 5614.6 (<.001) |      0.530 |      0.133 | 0.458
## tmp_model1_int_date_c   | 5470.5 (0.176) |      0.664 |      0.186 | 0.587
## tmp_model1_int_time_c   | 5657.8 (<.001) |      0.521 |      0.135 | 0.446
## tmp_model1_int_all_c    | 5467.4 (0.824) |      0.639 |      0.197 | 0.550
## tmp_model1_int_impday_c | 5572.9 (<.001) |      0.606 |      0.155 | 0.534
## 
## Name                    |   RMSE | Sigma | Score_log | Score_spherical
## ----------------------------------------------------------------------
## tmp_model1_cat          | 97.291 | 1.000 |    -0.591 |           0.015
## tmp_model1_int_imp_c    | 37.105 | 1.000 |    -0.587 |           0.015
## tmp_model1_int_date_c   | 16.454 | 1.000 |    -0.571 |           0.015
## tmp_model1_int_time_c   | 96.156 | 1.000 |    -0.591 |           0.015
## tmp_model1_int_all_c    | 16.016 | 1.000 |    -0.570 |           0.015
## tmp_model1_int_impday_c | 15.130 | 1.000 |    -0.584 |           0.015

Centering the predictors involved in interactions has fixed the collinearity issues, and all models show predictors with low to moderate correlation. Comparing model performance shows that the model which includes all interaction terms still performs much better than all other models with a much lower AIC, AICc, and BIC.

I can now check model assumptions for the model which includes all interaction terms.

# check assumptions for model with all interaction terms and centered predictors

# Check for overdispersion with performance package
check_overdispersion(tmp_model1_int_all_c)  
## # Overdispersion test
## 
##  dispersion ratio =   0.000
##           p-value = < 0.001
## Underdispersion detected.
check_collinearity(tmp_model1_int_all_c)
## # Check for Multicollinearity
## 
## * conditional component:
## 
## Low Correlation
## 
##                            Term  VIF   VIF 95% CI Increased SE Tolerance
##          cent_minutes_since_8am 4.09 [3.89, 4.31]         2.02      0.24
##               cent_relative_day 3.75 [3.57, 3.95]         1.94      0.27
##                        cent_imp 1.16 [1.13, 1.21]         1.08      0.86
##                  vol_moist_mean 1.66 [1.60, 1.74]         1.29      0.60
##                          clouds 4.54 [4.31, 4.78]         2.13      0.22
##                            wind 3.19 [3.04, 3.36]         1.79      0.31
##                           shade 2.33 [2.22, 2.44]         1.53      0.43
##  cent_t2:cent_minutes_since_8am 4.43 [4.21, 4.67]         2.10      0.23
##       cent_t2:cent_relative_day 4.02 [3.82, 4.23]         2.00      0.25
##                cent_t2:cent_imp 3.23 [3.07, 3.39]         1.80      0.31
##  Tolerance 95% CI
##      [0.23, 0.26]
##      [0.25, 0.28]
##      [0.83, 0.89]
##      [0.58, 0.63]
##      [0.21, 0.23]
##      [0.30, 0.33]
##      [0.41, 0.45]
##      [0.21, 0.24]
##      [0.24, 0.26]
##      [0.29, 0.33]
## 
## Moderate Correlation
## 
##     Term  VIF   VIF 95% CI Increased SE Tolerance Tolerance 95% CI
##  cent_t2 5.43 [5.15, 5.73]         2.33      0.18     [0.17, 0.19]
# check models assumptions and residuals with performance package
check_model(tmp_model1_int_all_c, verbose = T)
## `check_outliers()` does not yet support models of class `glmmTMB`.

# Compute fitted values and Pearson residuals
tmp_fitted_vals <- fitted(tmp_model1_int_all_c)
tmp_residuals <- residuals(tmp_model1_int_all_c, type = "pearson")
# Create binned residuals plot
arm::binnedplot(tmp_fitted_vals, tmp_residuals)

# simulate residuals and check model assumptions with DHARMa package
tmp_sim_model1 <- simulateResiduals(fittedModel = tmp_model1_int_all_c)
plot(tmp_sim_model1)
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details

The model with all interaction terms and centered predictors shows no pattern in the residuals plot from the DHARMa package and the Q-Q plot is well aligned with the 1:1 line. The binned plot from the arm package shows no clear trend. The plots and tests from the performance package show no sign of high collinearity, but a significant underdispersion (p < 0.001). Since the Q-Q plot and residuals plots look normal, this should not be a large issue. As described in the DHARMa package vignette, “From a technical side, underdispersion is not as concerning as over dispersion, as it will usually bias p-values to the conservative side”.

1.3.2 Final model choice

From the different model combinations, it appears that the best performing model is a zero-inflated Poisson GLMM with the number of ants per species as a response offset by the maximum number of ants ever found per species, which models how the different predictors affect the relative species-level ant activity. The model has 3 interaction terms: between ground temperature and impervious surface cover, between ground temperature and relative date, between ground temperature and time of day. All predictors involved in interaction terms are centered around their means.The categorical variables are modeled as factors.

# fit model with all interaction terms and centered predictors
model1 <- glmmTMB(individuals ~ cent_t2 * cent_minutes_since_8am + cent_t2 * cent_relative_day +
                                cent_t2 * cent_imp + vol_moist_mean +  
                                clouds + wind + shade +
                                (1 | site) + (1 | sp_abbr),
                              offset = log(species_max),   
                              ziformula = ~1,      # zero-inflation
                              family = poisson,   # poisson distribution
                              data = na.omit(env_analysis))
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
rm(list = ls(pattern = "^tmp_"))

1.4 Results interpretation and plotting

# show model results
Anova(model1, type = "III")
## Warning in printHypothesis(L, rhs, names(b)): one or more coefficients in the hypothesis include
##      arithmetic operators in their names;
##   the printed representation of the hypothesis will be omitted
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: individuals
##                                   Chisq Df Pr(>Chisq)    
## (Intercept)                      2.8156  1  0.0933519 .  
## cent_t2                          7.8600  1  0.0050540 ** 
## cent_minutes_since_8am         107.0816  1  < 2.2e-16 ***
## cent_relative_day              166.4223  1  < 2.2e-16 ***
## cent_imp                       105.3808  1  < 2.2e-16 ***
## vol_moist_mean                   7.4032  1  0.0065109 ** 
## clouds                         314.2051  2  < 2.2e-16 ***
## wind                           388.9281  2  < 2.2e-16 ***
## shade                          270.2005  2  < 2.2e-16 ***
## cent_t2:cent_minutes_since_8am   7.4807  1  0.0062363 ** 
## cent_t2:cent_relative_day      155.3776  1  < 2.2e-16 ***
## cent_t2:cent_imp                12.1228  1  0.0004981 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(model1)
##  Family: poisson  ( log )
## Formula:          
## individuals ~ cent_t2 * cent_minutes_since_8am + cent_t2 * cent_relative_day +  
##     cent_t2 * cent_imp + vol_moist_mean + clouds + wind + shade +  
##     (1 | site) + (1 | sp_abbr)
## Zero inflation:               ~1
## Data: na.omit(env_analysis)
##  Offset: log(species_max)
## 
##      AIC      BIC   logLik deviance df.resid 
##   5352.3   5467.4  -2658.2   5316.3     4410 
## 
## Random effects:
## 
## Conditional model:
##  Groups  Name        Variance Std.Dev.
##  site    (Intercept) 8.016    2.831   
##  sp_abbr (Intercept) 1.016    1.008   
## Number of obs: 4428, groups:  site, 33; sp_abbr, 27
## 
## Conditional model:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    -8.895e-01  5.301e-01  -1.678 0.093352 .  
## cent_t2                        -3.431e-02  1.224e-02  -2.804 0.005054 ** 
## cent_minutes_since_8am          2.918e-03  2.820e-04  10.348  < 2e-16 ***
## cent_relative_day              -2.693e-02  2.087e-03 -12.900  < 2e-16 ***
## cent_imp                       -5.001e-02  4.871e-03 -10.266  < 2e-16 ***
## vol_moist_mean                 -1.140e-02  4.190e-03  -2.721 0.006511 ** 
## cloudslightly cloudy           -2.058e+00  1.164e-01 -17.676  < 2e-16 ***
## cloudsvery cloudy              -2.191e+00  1.325e-01 -16.536  < 2e-16 ***
## windweak wind                  -6.242e-01  9.113e-02  -6.850 7.40e-12 ***
## windstrong wind                 6.868e-01  1.095e-01   6.273 3.54e-10 ***
## shadehalf-shade                -1.123e+00  6.838e-02 -16.423  < 2e-16 ***
## shadefull shade                -9.097e-01  8.737e-02 -10.411  < 2e-16 ***
## cent_t2:cent_minutes_since_8am -1.632e-04  5.968e-05  -2.735 0.006236 ** 
## cent_t2:cent_relative_day       4.666e-03  3.743e-04  12.465  < 2e-16 ***
## cent_t2:cent_imp                1.056e-03  3.033e-04   3.482 0.000498 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Zero-inflation model:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.42567    0.08326   29.14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# show model performance
model_performance(model1)
## # Indices of model performance
## 
## AIC      |     AICc |      BIC | R2 (cond.) | R2 (marg.) |   ICC |   RMSE
## -------------------------------------------------------------------------
## 5352.322 | 5352.477 | 5467.445 |      0.639 |      0.197 | 0.550 | 16.016
## 
## AIC      | Sigma | Score_log | Score_spherical
## ----------------------------------------------
## 5352.322 | 1.000 |    -0.570 |           0.015
# Extract fixed effects and compute IRR
tmp_model1_fixed_effects <- tidy(model1, effects = "fixed") %>%
  mutate(IRR = exp(estimate)) %>%
  select(term, estimate, std.error, p.value, IRR)

tmp_model1_fixed_effects
## # A tibble: 16 × 5
##    term                            estimate std.error   p.value    IRR
##    <chr>                              <dbl>     <dbl>     <dbl>  <dbl>
##  1 (Intercept)                    -0.889    0.530     9.34e-  2  0.411
##  2 cent_t2                        -0.0343   0.0122    5.05e-  3  0.966
##  3 cent_minutes_since_8am          0.00292  0.000282  4.27e- 25  1.00 
##  4 cent_relative_day              -0.0269   0.00209   4.47e- 38  0.973
##  5 cent_imp                       -0.0500   0.00487   1.01e- 24  0.951
##  6 vol_moist_mean                 -0.0114   0.00419   6.51e-  3  0.989
##  7 cloudslightly cloudy           -2.06     0.116     6.44e- 70  0.128
##  8 cloudsvery cloudy              -2.19     0.133     2.02e- 61  0.112
##  9 windweak wind                  -0.624    0.0911    7.40e- 12  0.536
## 10 windstrong wind                 0.687    0.109     3.54e- 10  1.99 
## 11 shadehalf-shade                -1.12     0.0684    1.31e- 60  0.325
## 12 shadefull shade                -0.910    0.0874    2.20e- 25  0.403
## 13 cent_t2:cent_minutes_since_8am -0.000163 0.0000597 6.24e-  3  1.00 
## 14 cent_t2:cent_relative_day       0.00467  0.000374  1.16e- 35  1.00 
## 15 cent_t2:cent_imp                0.00106  0.000303  4.98e-  4  1.00 
## 16 (Intercept)                     2.43     0.0833    1.28e-186 11.3
# report model results
report(model1)
## We fitted a zero-inflated poisson mixed model (estimated using ML and nlminb
## optimizer) to predict individuals with cent_t2, cent_minutes_since_8am,
## cent_relative_day, cent_imp, vol_moist_mean, clouds, wind and shade (formula:
## individuals ~ cent_t2 * cent_minutes_since_8am + cent_t2 * cent_relative_day +
## cent_t2 * cent_imp + vol_moist_mean + clouds + wind + shade). The model
## included site as random effects (formula: list(~1 | site, ~1 | sp_abbr)). The
## model's total explanatory power is substantial (conditional R2 = 0.64) and the
## part related to the fixed effects alone (marginal R2) is of 0.20. The model's
## intercept, corresponding to cent_t2 = 0, cent_minutes_since_8am = 0,
## cent_relative_day = 0, cent_imp = 0, vol_moist_mean = 0, clouds = no clouds,
## wind = no wind and shade = no shade, is at -0.89 (95% CI [-1.93, 0.15], p =
## 0.093). Within this model:
## 
##   - The effect of cent t2 is statistically significant and negative (beta =
## -0.03, 95% CI [-0.06, -0.01], p = 0.005; Std. beta = -0.11, 95% CI [-0.19,
## -0.02])
##   - The effect of cent minutes since 8am is statistically significant and
## positive (beta = 2.92e-03, 95% CI [2.37e-03, 3.47e-03], p < .001; Std. beta =
## 0.51, 95% CI [0.45, 0.57])
##   - The effect of cent relative day is statistically significant and negative
## (beta = -0.03, 95% CI [-0.03, -0.02], p < .001; Std. beta = -0.72, 95% CI
## [-0.79, -0.65])
##   - The effect of cent imp is statistically significant and negative (beta =
## -0.05, 95% CI [-0.06, -0.04], p < .001; Std. beta = -1.45, 95% CI [-1.68,
## -1.22])
##   - The effect of vol moist mean is statistically significant and negative (beta
## = -0.01, 95% CI [-0.02, -3.19e-03], p = 0.007; Std. beta = -0.09, 95% CI
## [-0.14, -0.04])
##   - The effect of clouds [lightly cloudy] is statistically significant and
## negative (beta = -2.06, 95% CI [-2.29, -1.83], p < .001; Std. beta = -2.20, 95%
## CI [-2.29, -2.11])
##   - The effect of clouds [very cloudy] is statistically significant and negative
## (beta = -2.19, 95% CI [-2.45, -1.93], p < .001; Std. beta = -2.28, 95% CI
## [-2.38, -2.18])
##   - The effect of wind [weak wind] is statistically significant and negative
## (beta = -0.62, 95% CI [-0.80, -0.45], p < .001; Std. beta = -0.70, 95% CI
## [-0.76, -0.63])
##   - The effect of wind [strong wind] is statistically significant and positive
## (beta = 0.69, 95% CI [0.47, 0.90], p < .001; Std. beta = 0.60, )
##   - The effect of shade [half-shade] is statistically significant and negative
## (beta = -1.12, 95% CI [-1.26, -0.99], p < .001; Std. beta = -1.15, 95% CI
## [-1.25, -1.04])
##   - The effect of shade [full shade] is statistically significant and negative
## (beta = -0.91, 95% CI [-1.08, -0.74], p < .001; Std. beta = -0.89, )
##   - The effect of cent t2 × cent minutes since 8am is statistically significant
## and negative (beta = -1.63e-04, 95% CI [-2.80e-04, -4.63e-05], p = 0.006; Std.
## beta = -0.15, 95% CI [-0.22, -0.08])
##   - The effect of cent t2 × cent relative day is statistically significant and
## positive (beta = 4.67e-03, 95% CI [3.93e-03, 5.40e-03], p < .001; Std. beta =
## 0.76, 95% CI [0.66, 0.86])
##   - The effect of cent t2 × cent imp is statistically significant and positive
## (beta = 1.06e-03, 95% CI [4.62e-04, 1.65e-03], p < .001; Std. beta = 0.15, 95%
## CI [0.10, 0.21])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation.
# get average values of all predictors involved in interactions
mean(na.omit(env_analysis$t2_mean))               # mean ground temperature
## [1] 24.24898
mean(na.omit(env_analysis$imp_mean))              # mean impervious surface cover
## [1] 43.29771
mean(na.omit(env_analysis$minutes_since_8am))     # mean exp start time
## [1] 241.8511
mean(na.omit(env_analysis$relative_day))          # mean exp date
## [1] 59.63866

I fitted a zero-inflated Poisson generalized linear mixed model (GLMM) (estimated using ML and nlminb optimizer) to analyze the effects of environmental conditions and urbanization on species-level ant foraging activity. Ant foraging activity was modeled as the species-specific number of ants collected at the end of each experiment, offset by the species’ maximum observed number of individuals across experiments. The environmental conditions and urbanization variables included as predictor variables (fixed effects) were ground temperature, soil moisture, mean impervious surface cover in a 100m radius around the experiment, time of day, date of experiment, cloud cover, wind, and shade. Three interaction terms were included to test whether the effect of temperature varied with impervious surface cover, date of experiment, and time of day. All variables included in the interaction terms were centered around their mean values to reduce multicollinearity (formula: ant activity ~ centered temperature * centered time of day + centered temperature * centered date of experiment + centered temperature * centered impervious surface cover + soil moisture + cloud cover + wind + shade). The model included a random intercept for site and species (formulas: ~ 1|site, ~1|species).

The model’s total explanatory power is substantial (conditional R2 = 0.64) and the part related to the fixed effects alone (marginal R2) is of 0.20. The variance of the site random effect (variance = 8.016) indicates that there is substantial site-level variation in ant activity. The smaller variance of the species random effect (variance = 1.016) suggests moderate species-level differences in ant activity.

The model fit yielded an AIC of 5352.3 and a BIC of 5467.4, with a log-likelihood of -2658.2 and a deviance of 5316.3. The model included 4428 observations, with random effects for 33 sites and 27 species.

The type III Wald chi-square tests from the analysis of deviance table indicate significant effects of all predictor variables and interaction terms (see ANOVA table, p < 0.05).

Since all variables involved in the interactions are centered around their mean values, the main effect of temperature at average impervious surface cover (43.30%), average experiment start time (241.85 minutes after 8:00 AM or 12:02 PM), and average experiment date (59.64 or June 10) is significantly negative (beta = −0.03, SE = 0.01, z = -2.80, P = 0.005). However, the effect of temperature on ant foraging activity varies significantly depending on impervious surface cover, experiment start time, and experiment date. Specifically, the effect of temperature on ant foraging activity becomes significantly less negative in more urbanized areas (beta = 0.001, SE = 0.0003, z = 3.48, p = 0.0005). The effect of temperature also becomes significantly less negative in experiments done at later dates in the season (beta = 0.005 SE = 0.0004, z = 12.47, p < 2e-16). The effect of temperature becomes significantly more negative when the experiment start is later in the day (beta = -0.0002, SE = 0.00006, z = -2.74, p = 0.006).

At an average ground temperature (24.24 °C), impervious surface cover has a significantly negative impact on ant foraging activity (beta = -0.05, SE = 0.005, z = -10.27, p < 2e-16), and so does date of experiment (i.e., at average temperatures, ant foraging activity decreases later in the season) (beta = -0.03, SE = 0.002, z = -12.90, p < 2e-16). However, at average ground temperatures, ant foraging activity significantly increases as experiments are started later in the day (beta = 0.003, SE = 0.0003, z = 10.35, p < 2e-16).

Soil moisture significantly decreases ant foraging activity (beta = -0.01, SE = 0.004, z = -2.72, p = 0.007). In terms of cloud cover and compared to the reference level “no clouds”, lightly cloudy conditions significantly increase ant foraging activity (beta = 1.42, SE = 0.08, z = 17.5, p < 2e-16) while very cloudy conditions decrease ant foraging activity (beta = -0.64, SE = 0.04, z = -15.19, p < 2e-16). Compared to the absence of wind, weak wind does not significantly influence ant foraging activity (beta = -0.02, SE = 0.06, z = -0.33, p = 0.74). However, strong wind has a significant negative impact on and foraging activity (beta = -0.65, SE = 0.04, z = -16.28, p < 2e-16). Compared to fully sunny conditions (no shade), half-shade had a significant positive impact on ant foraging activity (beta = 0.68, SE = 0.05, z = 14.56, p < 2e-16) while full shade had a significant negative impact (beta = -0.45, SE = 0.04, z = -12.20, p < 2e-16).

I can plot the interaction terms to visualize how the effect of temperature on ant activity varies with impervious surface cover, time of day and date of experiment.

# get centered values for 0th, 25th, 50th, 75th, 100th percentiles
tmp_centered_imp <- quantile(env_analysis$cent_imp, probs = c(0, 0.25, 0.50, 0.75, 1), na.rm = T)

# Get predicted values for temperature at specified quantiles of impervious surface
tmp_pred_int <- ggpredict(model1, terms = c("cent_t2", paste0("cent_imp [", paste(tmp_centered_imp, collapse = ", "), "]")))
## You are calculating adjusted predictions on the population-level (i.e.
##   `type = "fixed"`) for a *generalized* linear mixed model.
##   This may produce biased estimates due to Jensen's inequality. Consider
##   setting `bias_correction = TRUE` to correct for this bias.
##   See also the documentation of the `bias_correction` argument.
# plot effect of temperature on ant activity at different urbanization levels
(tmp_plot_int_imp <- ggplot(tmp_pred_int, aes(x = x, y = predicted, color = group)) + 
  geom_line(size = 1) +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high, fill = group), alpha = 0.05, color = NA) +  # adds 95% CI
  labs(x = "Centered ground temperature (\u00B0C)", y = "Predicted species-level ant foraging activity\n(predicted number of individuals)", 
       title = "Interaction of ground temperature\n and impervious surface cover") +
  scale_color_viridis_d() +  # adds viridis palette  
  scale_fill_viridis_d() +   # adds viridis palette   
  theme_classic2() +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5, size = 12, margin = margin(b = 15)),
        axis.title.x = element_blank(),  # remove axis title
        axis.title.y = element_text(margin = margin(r = 10), size = 10)) # adjust distance from axis title to axis
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# get centered values for 0th, 25th, 50th, 75th, 100th percentiles
tmp_centered_start_times <- quantile(env_analysis$cent_minutes_since_8am, probs = c(0, 0.25, 0.50, 0.75, 1), na.rm = T)

# Get predicted values for temperature at specified quantiles of impervious surface
tmp_pred_time <- ggpredict(model1, terms = c("cent_t2", paste0("cent_minutes_since_8am [", paste(tmp_centered_start_times, collapse = ", "), "]")))

# Plot effect of temperature on ant activity at different start times
(tmp_plot_int_time <- ggplot(tmp_pred_time, aes(x = x, y = predicted, color = group)) +
  geom_line(size = 1) +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high, fill = group), alpha = 0.1, color = NA) +  # adds 95% CI
  labs(x = "Centered ground temperature (\u00B0C)", y = "Predicted species-level ant foraging activity", 
       title = "Interaction of ground temperature\n and experiment start time") +
  scale_color_viridis_d() +  # adds viridis palette
  scale_fill_viridis_d() +   # adds viridis palette
  theme_classic2() +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5, size = 12, margin = margin(b = 15)),
        axis.title.x = element_text(margin = margin(t = 20), size = 10),  # adjust distance from axis title to axis
        axis.title.y = element_blank())  # remove axis title
)

# get centered values for 0th, 25th, 50th, 75th, 100th percentiles
tmp_centered_dates <- quantile(env_analysis$cent_relative_day, probs = c(0, 0.25, 0.50, 0.75, 1), na.rm = T)

# Get predicted values for temperature at specified quantiles of impervious surface
tmp_pred_date <- ggpredict(model1, terms = c("cent_t2", paste0("cent_relative_day [", paste(tmp_centered_dates, collapse = ", "), "]")))

# Define custom labels
tmp_date_labels <- c("0th", "25th", "50th", "75th", "100th")

# Plot effect of temperature on ant activity at different experiment dates
(tmp_plot_int_date <- ggplot(tmp_pred_date, aes(x = x, y = predicted, color = group)) +
  geom_line(size = 1) +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high, fill = group), alpha = 0.05, color = NA) +  # adds 95% CI
  labs(title = "Interaction of ground temperature\n and date of experiment",
       color = "Percentile", fill = "Percentile") +
  scale_color_viridis_d(labels = tmp_date_labels) +  # adds custom labels and viridis palette
  scale_fill_viridis_d(labels = tmp_date_labels) +   # adds custom labels and viridis palette
  theme_classic2() +
  theme(legend.text = element_text(size = 10),  # set legend text
        legend.title = element_text(size = 12, hjust = 0, vjust = 2),  # Smaller legend title
        legend.key.size = unit(0.5, "cm"),  # Makes the legend squares smaller
        plot.title = element_text(hjust = 0.5, size = 12, margin = margin(b = 15)),
        axis.title.x = element_blank(),  # remove axis title
        axis.title.y = element_blank()) + # remove axis title
  guides(fill = guide_legend(ncol = 1))  # Set legend to 1 column
)

# combine all interaction plots into 1 plot
(tmp_interaction_plots <- tmp_plot_int_imp + tmp_plot_int_time + tmp_plot_int_date +
    plot_layout(nrow = 1) + plot_annotation(tag_levels = 'a', tag_suffix = ")"))  # Arrange in 1 row, adds a), b), c)

# show different percentile values for each predictor
quantile(env_analysis$imp_mean, probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE)
##       0%      25%      50%      75%     100% 
##  0.00000 27.32551 48.20256 62.71733 93.84142
quantile(env_analysis$minutes_since_8am, probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE)
##   0%  25%  50%  75% 100% 
##   10  106  197  318  636
quantile(env_analysis$relative_day, probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE)
##   0%  25%  50%  75% 100% 
##    1   39   50   75  148

This graph shows predicted species-level ant foraging activity as a function of ground temperature across different levels of impervious surface cover (expressed in % of ground covered by impervious surfaces in a 100m radius around the experiment), time at which the experiment started (expressed in minutes since 8:00 AM), and date of experiment (expressed as a relative date, with the experiment done earliest in the year as day 1). The predictions are based on a zero-inflated Poisson GLMM with the number of ant individuals per species as a response and the maximum observed number of individuals per species included as an offset to account for species-level differences in colony size and foraging strategies. Predicted values reflect the expected number of individuals for an average ant species, shown here with the 95% confidence interval. The predicted values at the 0th, 25th, 50th, 75th, and 100th percentiles of impervious surface cover (0th = 0%, 25th = 27.33%, 50th = 48.20%, 75th = 62.72%, 100th = 93.84%), experiment start time (0th = 10 min or 8:10 AM, 25th = 106 min or 9:46 AM, 50th = 197 min or 11:17 AM, 75th = 318 min or 1:18 PM, 100th = 636 min or 6:36 PM) and experiment date (0th = day 1 or April 12, 25th = day 39 or May 20, 50th = day 50 or May 31, 75th = day 75 or June 25, 100th = day 148 or September 6) are shown.

To visualize how each environmental variable affects ant foraging activity and see which variables have the strongest effect, I can plot the effect sizes of the model.

# show model IRRs (exp(estimates))
tmp_model1_fixed_effects <- tidy(model1, effects = "fixed") %>%
  mutate(IRR = exp(estimate)) %>%
  select(term, estimate, std.error, p.value, IRR)

tmp_model1_fixed_effects
## # A tibble: 16 × 5
##    term                            estimate std.error   p.value    IRR
##    <chr>                              <dbl>     <dbl>     <dbl>  <dbl>
##  1 (Intercept)                    -0.889    0.530     9.34e-  2  0.411
##  2 cent_t2                        -0.0343   0.0122    5.05e-  3  0.966
##  3 cent_minutes_since_8am          0.00292  0.000282  4.27e- 25  1.00 
##  4 cent_relative_day              -0.0269   0.00209   4.47e- 38  0.973
##  5 cent_imp                       -0.0500   0.00487   1.01e- 24  0.951
##  6 vol_moist_mean                 -0.0114   0.00419   6.51e-  3  0.989
##  7 cloudslightly cloudy           -2.06     0.116     6.44e- 70  0.128
##  8 cloudsvery cloudy              -2.19     0.133     2.02e- 61  0.112
##  9 windweak wind                  -0.624    0.0911    7.40e- 12  0.536
## 10 windstrong wind                 0.687    0.109     3.54e- 10  1.99 
## 11 shadehalf-shade                -1.12     0.0684    1.31e- 60  0.325
## 12 shadefull shade                -0.910    0.0874    2.20e- 25  0.403
## 13 cent_t2:cent_minutes_since_8am -0.000163 0.0000597 6.24e-  3  1.00 
## 14 cent_t2:cent_relative_day       0.00467  0.000374  1.16e- 35  1.00 
## 15 cent_t2:cent_imp                0.00106  0.000303  4.98e-  4  1.00 
## 16 (Intercept)                     2.43     0.0833    1.28e-186 11.3
# plot fixed effects with IRR directly
(tmp_effect_sizes <- plot_model(model1, type = "est", show.values = TRUE, 
                                value.offset = 0.3, digits = 4, dot.size = 1, value.size = 3) +
  geom_hline(yintercept = 1, color = "black", linetype = 3) +   # add dashed vertical line at 1
  theme_bw() +
  ggtitle("Effect sizes of predictors on ant foraging activity") +
  theme(plot.title = element_text(hjust = 0.5, size = 12),
        axis.text.x = element_text(size = 9),
        axis.text.y = element_text(size = 9, hjust = 1)) + 
  scale_x_discrete(labels = c(                                 # rename the predictors
    "cent_t2" = "Ground temperature (centered)",  
    "cent_imp" = "Impervious surface cover (centered)",
    "vol_moist_mean" = "Soil moisture",
    "cent_relative_day" = "Date of experiment (centered)",
    "cent_minutes_since_8am" = "Experiment start time (centered)",
    "clouds1" = "Cloud cover (lightly cloudy)",
    "clouds2" = "Cloud cover (very cloudy)",
    "wind1" = "Wind (weak wind)",
    "wind2" = "Wind (strong wind)",
    "shade1" = "Shade (half-shade)",
    "shade2" = "Shade (full shade)",
    "cent_t2:cent_minutes_since_8am" = "Interaction of temperature and start time (centered)",
    "cent_t2:cent_relative_day" = "Interaction of temperature and date (centered)",
    "cent_t2:cent_imp" = "Interaction of temperature and impervious surface (centered)"
  ))  # Custom labels for the predictors
)
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.

# plot random effects
plot_model(model1, type = "re", vline.color = "black")
## [[1]]

## 
## [[2]]

rm(list = ls(pattern = "^tmp_"))

2. Research Question 2

The overarching research question is: Can ants differentiate between baits of different sugar concentrations, and do they forage more actively on high sugar concentration baits?

More specifically, is there a difference in the average number of ants collected on the baits with different sugar concentrations at the end of the baiting experiments ? Is there an increase in the number of ants present on baits with a higher sugar concentration ?

  • H0: the average number of ants found at the end of the experiments the same across all sugar concentrations
  • H1: the average number of ants found at the end of the experiments is significantly different between the different concentrations, and significantly increases with increasing sugar concentration.

This research question is treated in two parts: first, I test the effect of sugar concentration on ant foraging activity in terms of presence or absence of ants on the different sugar concentrations. Then, when ants are present on the experiments, I test the effect of sugar concentrations on and foraging activity in terms of species-specific abundance of ants on the different sugar concentrations.

Description of tested variables

  • Two response variables:

    • overall (non species-specific) presence or absence of ants on each bait at the end of each experiment (binary)
    • species-specific proportion of ants found on each bait at the end of each experiment (quantitative, rate between 0 and 1)
  • Explanatory variables: the sugar concentration of the sugar-water solutions added to the baits (either discrete quantitative or ordinal categorical)

  • Random effects:

    • Site (nominal categorical)
    • Species (nominal categorical)

2.1 Presence/absence of ants

I start by modelling the effect of sugar concentration on the presence or absence of ants on the baits. The question is: are ants more likely to be present on certain sugar concentrations than others ?

2.1.1 Exploratory data analysis

I calculate the proportion of baits where ants were present and absent, then I can visualize ant presence vs sugar concentration.This is done both for the species-level table (sugar_analysis) and the overall table (sugar_analysis_overall).

I need to model overall presence and absence, not species-level presence because it adds a lot of “artificial/structural” zeros from species which were not observed on an experiment (most experiments only had 1-2 species present, up to 5).

# calculate proportion of presence at species-level
table(sugar_analysis$presence)  
## 
##     0     1 
## 27222   453
prop.table(table(sugar_analysis$presence))  # Proportion of 1s and 0s
## 
##          0          1 
## 0.98363144 0.01636856
# calculate overall proportion of presence
table(sugar_analysis_overall$presence)  
## 
##   0   1 
## 645 380
prop.table(table(sugar_analysis_overall$presence))  # Proportion of 1s and 0s
## 
##         0         1 
## 0.6292683 0.3707317
# visualize ant presence vs concentration at species-level
ggplot(na.omit(sugar_analysis), aes(x = factor(concentration), fill = factor(presence))) +
  geom_bar(position = "fill", color = "black") +  # add black outline
  scale_fill_manual(values = c("white", "black"),
                    labels = c("Absence", "Presence")) + 
  labs(x = "Sugar concentration (%)", y = "Proportion of baits occupied", fill = "") +
  theme_bw() +
  theme(panel.grid.major = element_blank(),  # Remove major grid
        panel.grid.minor = element_blank(),    # Remove minor grid
        axis.title.y = element_text(margin = margin(r = 10), hjust = 0.5), # adjust distance from axis title to axis, center title
        axis.title.x = element_text(margin = margin(t = 10)))

# visualize ant presence vs concentration overall
(tmp_raw_presence_plot <- ggplot(na.omit(sugar_analysis_overall), aes(x = factor(concentration), fill = factor(presence))) +
  geom_bar(position = "fill", color = "black", size = 0.2) +  # add black outline
  scale_fill_manual(values = c("white", "black"),
                    labels = c("Absence", "Presence")) +  # Apply Acadia discrete color scale
  labs(x = "Sugar concentration (%)", y = "Observed proportion of baits occupied by ants", fill = "") +
  theme_bw() +
  theme(panel.grid.major = element_blank(),  # Remove major grid
        panel.grid.minor = element_blank(),    # Remove minor grid
        legend.position = "right",           # set legend position
        legend.text = element_text(size = 8),  # set legend text
        legend.key.size = unit(0.4, "cm"),  # Makes the legend squares smaller
        axis.title.y = element_text(size = 10, margin = margin(r = 10), hjust = 0.5), # adjust distance from axis title to axis, center title
        axis.title.x = element_text(size = 10, margin = margin(t = 10))))

2.1.2 Model choice and model assumptions

For testing the influence of sugar concentration on ant presence or absence, my response variable is binary (1 = ants are present on a given bait, 0 = no ants were found on the bait). For binary data with random effects (the site variable), I run a logistic GLMM.

The sugar concentration predictor can either be considered as a discrete numerical variable or a ordinal categorical variable. I will fit two models with these two options and check which model performs best.

I am modelling the overall ant presence/absence, NOT the species-level data.

# fit model with numerical concentration
tmp_model3_pres_num <- glmer(presence ~ as.numeric(concentration) + (1 | site),
                        data = sugar_analysis_overall, family = binomial,
                        control = glmerControl(optimizer = "bobyqa"))

# show summary table to ensure the model has converged (no warnings)
summary(tmp_model3_pres_num)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: presence ~ as.numeric(concentration) + (1 | site)
##    Data: sugar_analysis_overall
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##   1159.7   1174.5   -576.9   1153.7     1007 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.2603 -0.7111 -0.3302  0.8102  4.1420 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  site   (Intercept) 1.894    1.376   
## Number of obs: 1010, groups:  site, 36
## 
## Fixed effects:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -1.301343   0.269468  -4.829 1.37e-06 ***
## as.numeric(concentration)  0.043983   0.005328   8.255  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## as.nmrc(cn) -0.327
# simulate residuals and check model assumptions with DHARMa package
tmp_sim_model3_pres_num <- simulateResiduals(fittedModel = tmp_model3_pres_num)
plot(tmp_sim_model3_pres_num)

# fit model with categorical concentration
tmp_model3_pres_cat <- glmer(presence ~ as.factor(concentration) + (1 | site),
                        data = sugar_analysis_overall, family = binomial,
                        control = glmerControl(optimizer = "bobyqa"))

# show summary table to ensure the model has converged (no warnings)
summary(tmp_model3_pres_cat)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: presence ~ as.factor(concentration) + (1 | site)
##    Data: sugar_analysis_overall
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##   1137.3   1166.8   -562.7   1125.3     1004 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.1386 -0.6349 -0.2993  0.7563  5.7872 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  site   (Intercept) 2.015    1.419   
## Number of obs: 1010, groups:  site, 36
## 
## Fixed effects:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -2.1250     0.3307  -6.427 1.30e-10 ***
## as.factor(concentration)5    1.3603     0.2700   5.038 4.71e-07 ***
## as.factor(concentration)10   1.6683     0.2675   6.238 4.44e-10 ***
## as.factor(concentration)20   1.7733     0.2680   6.616 3.70e-11 ***
## as.factor(concentration)40   2.4261     0.2715   8.934  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) as.()5 a.()10 a.()20
## as.fctr(c)5 -0.515                     
## as.fctr()10 -0.523  0.640              
## as.fctr()20 -0.524  0.640  0.649       
## as.fctr()40 -0.529  0.639  0.649  0.651
# simulate residuals and check model assumptions with DHARMa package
tmp_sim_model3_pres_cat <- simulateResiduals(fittedModel = tmp_model3_pres_cat)
plot(tmp_sim_model3_pres_cat)

compare_performance(tmp_model3_pres_num, tmp_model3_pres_cat)
## # Comparison of Model Performance Indices
## 
## Name                |    Model |  AIC (weights) | AICc (weights)
## ----------------------------------------------------------------
## tmp_model3_pres_num | glmerMod | 1159.7 (<.001) | 1159.8 (<.001)
## tmp_model3_pres_cat | glmerMod | 1137.3 (>.999) | 1137.4 (>.999)
## 
## Name                |  BIC (weights) | R2 (cond.) | R2 (marg.) |   ICC |  RMSE
## ------------------------------------------------------------------------------
## tmp_model3_pres_num | 1174.5 (0.021) |      0.410 |      0.070 | 0.365 | 0.419
## tmp_model3_pres_cat | 1166.8 (0.979) |      0.447 |      0.109 | 0.380 | 0.411
## 
## Name                | Sigma | Log_loss | Score_log | Score_spherical
## --------------------------------------------------------------------
## tmp_model3_pres_num | 1.000 |    0.522 |      -Inf |           0.003
## tmp_model3_pres_cat | 1.000 |    0.507 |      -Inf |           0.003

When comparing model performance and model diagnostics plots, it appears that the model with concentration as a factor performs significantly better, so that is the model I use for the analysis.

# final model
model3_presence <- glmer(presence ~ as.factor(concentration) + (1 | site),
                        data = sugar_analysis_overall, family = binomial,
                        control = glmerControl(optimizer = "bobyqa"))

# Check for overdispersion with performance package
check_overdispersion(model3_presence)
## # Overdispersion test
## 
##  dispersion ratio = 0.979
##           p-value =  0.52
## No overdispersion detected.
# check model assumptions and residuals with performance package
check_model(model3_presence, verbose = T)
## Not enough model terms in the conditional part of the model to check for
##   multicollinearity.
## Failed to compute posterior predictive checks with `re_formula=NULL`.
##   Trying again with `re_formula=NA` now.

# check residuals with DHARMa package
tmp_model3_pres_sim <- simulateResiduals(fittedModel = model3_presence)
plot(tmp_model3_pres_sim)

The model shows no signs of overdispersion, the outlier test performed by the DHARMa package is not significant, and the residuals vs predicted values plot as well as the binned residuals plot do not show blatant or strong patterns in the residuals.

2.1.3 Results interpretation and visualization
# show model results
Anova(model3_presence, type = "II")
## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: presence
##                          Chisq Df Pr(>Chisq)    
## as.factor(concentration) 83.15  4  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(model3_presence)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: presence ~ as.factor(concentration) + (1 | site)
##    Data: sugar_analysis_overall
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##   1137.3   1166.8   -562.7   1125.3     1004 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.1386 -0.6349 -0.2993  0.7563  5.7872 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  site   (Intercept) 2.015    1.419   
## Number of obs: 1010, groups:  site, 36
## 
## Fixed effects:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -2.1250     0.3307  -6.427 1.30e-10 ***
## as.factor(concentration)5    1.3603     0.2700   5.038 4.71e-07 ***
## as.factor(concentration)10   1.6683     0.2675   6.238 4.44e-10 ***
## as.factor(concentration)20   1.7733     0.2680   6.616 3.70e-11 ***
## as.factor(concentration)40   2.4261     0.2715   8.934  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) as.()5 a.()10 a.()20
## as.fctr(c)5 -0.515                     
## as.fctr()10 -0.523  0.640              
## as.fctr()20 -0.524  0.640  0.649       
## as.fctr()40 -0.529  0.639  0.649  0.651
# Convert model estimates to Odds Ratios
tmp_model_or <- tidy(model3_presence, effects = "fixed") %>%
  mutate(OR = exp(estimate),  # Convert log-odds to ORs
         OR_low = exp(estimate - 1.96 * std.error),  # Lower CI
         OR_high = exp(estimate + 1.96 * std.error)) # Upper CI
print(tmp_model_or)
## # A tibble: 5 × 9
##   effect term        estimate std.error statistic  p.value     OR OR_low OR_high
##   <chr>  <chr>          <dbl>     <dbl>     <dbl>    <dbl>  <dbl>  <dbl>   <dbl>
## 1 fixed  (Intercept)    -2.13     0.331     -6.43 1.30e-10  0.119 0.0625   0.228
## 2 fixed  as.factor(…     1.36     0.270      5.04 4.71e- 7  3.90  2.30     6.62 
## 3 fixed  as.factor(…     1.67     0.267      6.24 4.44e-10  5.30  3.14     8.96 
## 4 fixed  as.factor(…     1.77     0.268      6.62 3.70e-11  5.89  3.48     9.96 
## 5 fixed  as.factor(…     2.43     0.272      8.93 4.09e-19 11.3   6.65    19.3
# report model results
report(model3_presence)
## We fitted a logistic mixed model (estimated using ML and BOBYQA optimizer) to
## predict presence with concentration (formula: presence ~
## as.factor(concentration)). The model included site as random effect (formula:
## ~1 | site). The model's total explanatory power is substantial (conditional R2
## = 0.45) and the part related to the fixed effects alone (marginal R2) is of
## 0.11. The model's intercept, corresponding to concentration = 0, is at -2.13
## (95% CI [-2.77, -1.48], p < .001). Within this model:
## 
##   - The effect of concentration [5] is statistically significant and positive
## (beta = 1.36, 95% CI [0.83, 1.89], p < .001; Std. beta = 1.36, 95% CI [0.83,
## 1.89])
##   - The effect of concentration [10] is statistically significant and positive
## (beta = 1.67, 95% CI [1.14, 2.19], p < .001; Std. beta = 1.67, 95% CI [1.14,
## 2.19])
##   - The effect of concentration [20] is statistically significant and positive
## (beta = 1.77, 95% CI [1.25, 2.30], p < .001; Std. beta = 1.77, 95% CI [1.25,
## 2.30])
##   - The effect of concentration [40] is statistically significant and positive
## (beta = 2.43, 95% CI [1.89, 2.96], p < .001; Std. beta = 2.43, 95% CI [1.89,
## 2.96])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation.

I fitted a generalized linear mixed model (GLMM) with a binomial distribution and a logit link function (estimated using ML (Laplace Approximation) and BOBYQA optimizer) to analyze the effect of sugar concentration on the presence or absence of foraging ants on baits (formula: presence ~ as.factor(concentration)). The model included site as a random effect (formula: ~ 1|site). The model’s total explanatory power is substantial (conditional R2 = 0.45) and the part related to the fixed effects alone (marginal R2) is of 0.11. The variance of the site random effect (variance = 2.015) indicates that there is variability in ant presence across different sites.

Sugar concentration shows a positive and highly significant effect on ant presence (Type II Wald Chi-square test, Chisq(4) = 83.16, p < 2.2e-16), showing that baits with sugar water increase the probability of ants being present on the baits compared to pure water baits. Compared to the reference level (0% sugar), the odds of ant presence increased significantly at all concentrations. More specifically, for each concentration (as obtained by Wald z-tests):
- The effect of 5% sugar is statistically significant and positive (beta = 1.36, SE = 0.27, z = 5.04, p = 4.70e-07)
- The effect of 10% sugar is statistically significant and positive (beta = 1.67, SE = 0.27,z = 6.24, p = 4.44e-10)
- The effect of 20% sugar is statistically significant and positive (beta = 1.77, SE = 0.27, z = 6.62 , p = 3.69e-11)
- The effect of 40% sugar is statistically significant and positive (beta = 2.43, SE = 0.27, z = 8.94, p < 2e-16)

I can now run a post-hoc test to test for significant differences between the concentrations.

# compute pairwise comparisons between the concentrations
tmp_model3_pres_emm <- emmeans(model3_presence, pairwise ~ concentration, adjust = "tukey")

print(tmp_model3_pres_emm)
## $emmeans
##  concentration emmean    SE  df asymp.LCL asymp.UCL
##              0 -2.125 0.331 Inf    -2.773    -1.477
##              5 -0.765 0.301 Inf    -1.354    -0.176
##             10 -0.457 0.297 Inf    -1.039     0.126
##             20 -0.352 0.297 Inf    -0.934     0.231
##             40  0.301 0.297 Inf    -0.281     0.883
## 
## Results are given on the logit (not the response) scale. 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast                          estimate    SE  df z.ratio p.value
##  concentration0 - concentration5     -1.360 0.270 Inf  -5.038  <.0001
##  concentration0 - concentration10    -1.668 0.267 Inf  -6.238  <.0001
##  concentration0 - concentration20    -1.773 0.268 Inf  -6.616  <.0001
##  concentration0 - concentration40    -2.426 0.272 Inf  -8.934  <.0001
##  concentration5 - concentration10    -0.308 0.228 Inf  -1.350  0.6599
##  concentration5 - concentration20    -0.413 0.228 Inf  -1.809  0.3682
##  concentration5 - concentration40    -1.066 0.230 Inf  -4.629  <.0001
##  concentration10 - concentration20   -0.105 0.224 Inf  -0.468  0.9902
##  concentration10 - concentration40   -0.758 0.226 Inf  -3.358  0.0070
##  concentration20 - concentration40   -0.653 0.225 Inf  -2.898  0.0308
## 
## Results are given on the log odds ratio (not the response) scale. 
## P value adjustment: tukey method for comparing a family of 5 estimates

To test for pairwise differences between the different sugar concentration levels, I conducted a post-hoc analysis using Tukey-adjusted Wald z-tests for pairwise comparisons of estimated marginal means (EMMs) on the logit scale. As described above, all sugar concentrations greater than 0% had a significantly positive effect on ant presence compared to pure water (p < 0.0001).

However, comparisons between intermediate concentrations revealed that there was no significant difference between 5% sugar and 10% sugar (beta = -0.31, SE = 0.29, z = -1.35, p = 0.66), 5% sugar and 20% sugar (beta = -0.41, SE = 0.23, z = -1.81, p = 0.37), or 10% sugar and 20% sugar (beta = -0.11, SE = 0.22, z = -0.47, p = 0.99). In contrast, the 40% sugar concentration had significantly greater ant presence than all other concentrations: 5% sugar (beta = -1.07, SE = 0.23, z = -4.63, p <.0001), 10% sugar (beta = -0.76. SE = 0.23, z = -3.36, p = 0.007), and 20% sugar (beta = -0.65, SE = 0.23, z = -2.90, P = 0.03), indicating a nonlinear response to sugar concentration.

I can plot the predicted probabilities from the model and the results of the pairwise comparisons:

# get predicted probabilities (transforme log-odds)
tmp_emm_preds <- emmeans(model3_presence, ~ concentration, type = "response")

# Convert to a data frame for plotting
tmp_emm_preds_df <- as.data.frame(tmp_emm_preds)

# get pairwise contrasts
tmp_posthoc_results <- pairs(tmp_emm_preds, adjust = "tukey")

# convert to compact letter display  (CLD) format
tmp_cld_results <- multcomp::cld(tmp_emm_preds, Letters = letters)

# Merge with prediction data
tmp_emm_preds_df$letters <- tmp_cld_results$.group

(tmp_presence_pred_plot <- ggplot(tmp_emm_preds_df, aes(x = as.factor(concentration), y = prob)) +
  geom_point(size = 2) +   # Points for predicted probabilities
  geom_errorbar(aes(ymin = asymp.LCL, ymax = asymp.UCL), width = 0.1, size = 0.3) +  # 95% CI
  geom_text(aes(label = letters, y = prob), 
            nudge_x = 0.2,  # move label horizontally (increase to move it right)
            size = 4) +  # Set text size  
  labs(x = "Sugar concentration (%)", y = "Predicted proportion of baits occupied by ants") +
  theme_bw() +
  theme(axis.title.y = element_text(size = 10, margin = margin(r = 10), hjust = 0.5), # adjust distance from axis title to axis, center title
        axis.title.x = element_text(size = 10, margin = margin(t = 10))))

# combine both presence plots into 1 plot
(tmp_presence_plots <- tmp_raw_presence_plot + tmp_presence_pred_plot +
    plot_layout(nrow = 1) + plot_annotation(tag_levels = 'a', tag_suffix = ")"))  # Arrange in 1 row, adds a), b), c)

rm(list = ls(pattern = "^tmp_"))

2.2 Ant abundance

Now that I have tested whether sugar concentration affects the likelihood of ants being present at baits, I can look at the second part of the research question: when ants are present, do more ants recruit to the higher sugar concentrations ?

For this analysis, I will only look at experiments where at least one ant was present. For each experiment, I will only keep the rows for which the species total for that experiment is > 0 (i.e., that species was present on at least 1 bait of the experiment).

# remove experiments with no ants
sugar_analysis_abundance <- sugar_analysis %>%
  group_by(sample_ID) %>%  # group by experiment
  dplyr::filter(any(individuals > 0, na.rm = T)) %>%  # keep only exp where at least 1 individual is found
  ungroup()

# only keep rows of species that were present on the experiment
sugar_analysis_abundance <- sugar_analysis_abundance %>%
  filter(species_total_exp > 0)

The data frame now has 5 rows per species found on a given experiment (1 per concentration). I.e., for an experiment with 3 species found on it in total, there will be 1 row per species found per concentration, so 3 * 5 = 15 rows. An experiment were only 1 species was found will have 1 * 5 = 5 rows.

2.2.1 Exploratory data analysis
# Histogram of raw counts (individuals)
ggplot(na.omit(sugar_analysis_abundance), aes(x = individuals)) +
  geom_histogram(binwidth = 5, fill = "skyblue", color = "black") +
  labs(title = "Distribution of ant counts on baits", x = "Species-level ant count per bait", y = "Frequency") +
  theme_classic2() +
  theme(axis.title.y = element_text(margin = margin(r = 10), hjust = 0.5), # adjust distance from axis title to axis, center title
        axis.title.x = element_text(margin = margin(t = 10)))

# number of individuals of each species vs sugar concentration
ggplot(na.omit(sugar_analysis_abundance), aes(x = as.factor(concentration), y = individuals)) +
  geom_boxplot() +
  labs(x = "Sugar concentration (%)", y = "Species-level number of individuals") +
  theme_classic2() +
  theme(axis.title.y = element_text(margin = margin(r = 10), hjust = 0.5), # adjust distance from axis title to axis, center title
        axis.title.x = element_text(margin = margin(t = 10)))

# scaled ant activity per species vs concentration
(tmp_raw_abundance_plot <- ggplot(na.omit(sugar_analysis_abundance), aes(x = as.factor(concentration), y = scaled_species_total)) +
  geom_boxplot(size = 0.3) +
  labs(x = "Sugar concentration (%)", y = "Observed species-level proportion of individuals") +
  theme_bw() +
  theme(panel.grid.major = element_blank(),  # Remove major grid
        panel.grid.minor = element_blank(),
        axis.title.y = element_text(size = 10, margin = margin(r = 10), hjust = 0.5), # adjust distance from axis title to axis, center title
        axis.title.x = element_text(size = 10, margin = margin(t = 10))))

# for plotting overall (non species-specific) abundance, remove exp with 0 ants from sugar_analysis_overall
sugar_analysis_overall_abundance <- sugar_analysis_overall %>%
  group_by(sample_ID) %>%  # group by experiment
  dplyr::filter(any(individuals > 0, na.rm = T)) %>%  # keep only exp where at least 1 individual is found
  ungroup()

# number of individuals overall vs sugar concentration
ggplot(na.omit(sugar_analysis_overall_abundance), aes(x = as.factor(concentration), y = individuals)) +
  geom_boxplot() +
  labs(x = "Sugar concentration (%)", y = "Overall number of individuals") +
  theme_classic2() +
  theme(axis.title.y = element_text(margin = margin(r = 10), hjust = 0.5), # adjust distance from axis title to axis, center title
        axis.title.x = element_text(margin = margin(t = 10)))

# scaled ant activity overall vs concentration
ggplot(na.omit(sugar_analysis_overall_abundance), aes(x = as.factor(concentration), y = scaled_ant_total)) +
  geom_boxplot() +
  labs(x = "Sugar concentration (%)", y = "Overall proportion of individuals") +
  theme_classic2() +
  theme(axis.title.y = element_text(margin = margin(r = 10), hjust = 0.5), # adjust distance from axis title to axis, center title
        axis.title.x = element_text(margin = margin(t = 10)))

2.2.2 Model choice and model assumptions

I want to test the influence of sugar concentration on ant abundance. My response variable is based on count data: I look at the number of ants of one species present on one concentration of one experiment, offset by the total number of ants of that species on that experiment (i.e., the proportion of ants of one species on each bait of one experiment).

Since my response variable is a proportion, I can fit a binomial GLMM with the number of individuals of one species on one bait as a “success”, and the number of ants on all other baits (total number of ants of the experiments - number of ants on the bait in question) as a “failure”. If the data shows significant overdispersion, I can try to fit a beta-binomial GLMM.

The sugar concentration predictor can either be considered as a discrete numerical variable or a ordinal categorical variable. I also fit two models with these two options and check which model performs best.

# Binomial GLMM model with numerical concentration
tmp_model3_ab_num <- glmmTMB(cbind(individuals, species_total_exp - individuals) ~ 
                          concentration + (1 | site) + (1 | sp_abbr), 
                          family = binomial(link = "logit"),
                          data = sugar_analysis_abundance)

# Check for overdispersion with performance package
check_overdispersion(tmp_model3_ab_num)
## # Overdispersion test
## 
##  dispersion ratio =   1.538
##           p-value = < 0.001
## Overdispersion detected.
# check residuals with DHARMa package
tmp_model3_ab_sim <- simulateResiduals(fittedModel = tmp_model3_ab_num)
plot(tmp_model3_ab_sim)
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details

# Binomial GLMM model with categorical concentration
tmp_model3_ab_cat <- glmmTMB(cbind(individuals, species_total_exp - individuals) ~ 
                          as.factor(concentration) + (1 | site) + (1 | sp_abbr), 
                          family = binomial(link = "logit"),
                          data = sugar_analysis_abundance)

# Check for overdispersion with performance package
check_overdispersion(tmp_model3_ab_cat)
## # Overdispersion test
## 
##  dispersion ratio =   1.545
##           p-value = < 0.001
## Overdispersion detected.
# check residuals with DHARMa package
tmp_model3_ab_sim <- simulateResiduals(fittedModel = tmp_model3_ab_cat)
plot(tmp_model3_ab_sim)
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details

Both the overdispersion test from the performance package as well as the plots from the DHARMa package show overdisperion (with concentration as a factor or as a continuous variable). A binomial GLMM would thus not fit my data.

I try to fit a beta-binomial GLMM.

# Beta-binomial GLMM with numerical concentration
tmp_model3_ab_num <- glmmTMB(cbind(individuals, species_total_exp - individuals) ~ 
                               concentration + (1 | site) + (1 | sp_abbr), 
                               family = betabinomial,  # Beta-binomial family
                               data = sugar_analysis_abundance)

# show summary table to ensure the model has converged (no warnings)
summary(tmp_model3_ab_num)
##  Family: betabinomial  ( logit )
## Formula:          
## cbind(individuals, species_total_exp - individuals) ~ concentration +  
##     (1 | site) + (1 | sp_abbr)
## Data: sugar_analysis_abundance
## 
##      AIC      BIC   logLik deviance df.resid 
##   3056.6   3081.5  -1523.3   3046.6     1071 
## 
## Random effects:
## 
## Conditional model:
##  Groups  Name        Variance  Std.Dev. 
##  site    (Intercept) 1.188e-09 3.447e-05
##  sp_abbr (Intercept) 7.827e-11 8.847e-06
## Number of obs: 1076, groups:  site, 32; sp_abbr, 27
## 
## Dispersion parameter for betabinomial family (): 1.68 
## 
## Conditional model:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -2.226033   0.090701  -24.54   <2e-16 ***
## concentration  0.051403   0.003648   14.09   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Check for overdispersion with performance package
check_overdispersion(tmp_model3_ab_num)
## # Overdispersion test
## 
##  dispersion ratio = 0.924
##           p-value = 0.792
## No overdispersion detected.
# check model assumptions and residuals with performance package
check_model(tmp_model3_ab_num, verbose = T)
## Not enough model terms in the conditional part of the model to check for
##   multicollinearity.
## `check_outliers()` does not yet support models of class `glmmTMB`.
## Using `ci_type = "gaussian"` because model is not bernoulli.

# check residuals with DHARMa package
tmp_model3_ab_sim <- simulateResiduals(fittedModel = tmp_model3_ab_num)
plot(tmp_model3_ab_sim)

# fit model with categorical concentration
tmp_model3_ab_cat <- glmmTMB(cbind(individuals, species_total_exp - individuals) ~ 
                               as.factor(concentration) + (1 | site) + (1 | sp_abbr), 
                               family = betabinomial,  # Beta-binomial family
                               data = sugar_analysis_abundance)

# show summary table to ensure the model has converged (no warnings)
summary(tmp_model3_ab_cat)
##  Family: betabinomial  ( logit )
## Formula:          
## cbind(individuals, species_total_exp - individuals) ~ as.factor(concentration) +  
##     (1 | site) + (1 | sp_abbr)
## Data: sugar_analysis_abundance
## 
##      AIC      BIC   logLik deviance df.resid 
##   3033.5   3073.3  -1508.7   3017.5     1068 
## 
## Random effects:
## 
## Conditional model:
##  Groups  Name        Variance  Std.Dev. 
##  site    (Intercept) 3.959e-10 1.990e-05
##  sp_abbr (Intercept) 5.966e-10 2.443e-05
## Number of obs: 1076, groups:  site, 32; sp_abbr, 27
## 
## Dispersion parameter for betabinomial family (): 1.74 
## 
## Conditional model:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -2.8967     0.1823 -15.893  < 2e-16 ***
## as.factor(concentration)5    0.9732     0.2186   4.453 8.48e-06 ***
## as.factor(concentration)10   1.4581     0.2118   6.883 5.84e-12 ***
## as.factor(concentration)20   1.9305     0.2113   9.138  < 2e-16 ***
## as.factor(concentration)40   2.5757     0.2080  12.381  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Check for overdispersion with performance package
check_overdispersion(tmp_model3_ab_cat)
## # Overdispersion test
## 
##  dispersion ratio = 0.970
##           p-value = 0.976
## No overdispersion detected.
# check model assumptions and residuals with performance package
check_model(tmp_model3_ab_cat, verbose = T)
## Not enough model terms in the conditional part of the model to check for
##   multicollinearity.
## `check_outliers()` does not yet support models of class `glmmTMB`.
## Using `ci_type = "gaussian"` because model is not bernoulli.

# check residuals with DHARMa package
tmp_model3_ab_sim <- simulateResiduals(fittedModel = tmp_model3_ab_cat)
plot(tmp_model3_ab_sim)

compare_performance(tmp_model3_ab_num, tmp_model3_ab_cat)
## Random effect variances not available. Returned R2 does not account for random effects.
## Random effect variances not available. Returned R2 does not account for random effects.
## # Comparison of Model Performance Indices
## 
## Name              |   Model |  AIC (weights) | AICc (weights) |  BIC (weights)
## ------------------------------------------------------------------------------
## tmp_model3_ab_num | glmmTMB | 3056.6 (<.001) | 3056.6 (<.001) | 3081.5 (0.017)
## tmp_model3_ab_cat | glmmTMB | 3033.5 (>.999) | 3033.6 (>.999) | 3073.3 (0.983)
## 
## Name              | R2 (cond.) | R2 (marg.) |  RMSE | Sigma | Log_loss
## ----------------------------------------------------------------------
## tmp_model3_ab_num |            |      0.379 | 0.314 | 1.682 |         
## tmp_model3_ab_cat |            |      0.471 | 0.313 | 1.736 |

The beta-binomial model shows no signs of overdispersion and the residuals vs predicted values plot does not show blatant or strong patterns in the residuals. A beta-binomial GLMM is an appropriate for my data.

When comparing model performance and model diagnostics plots, it appears that the model with concentration as a factor performs better, so that is the model I use for the analysis.

# final model
model3_abundance <- glmmTMB(cbind(individuals, species_total_exp - individuals) ~ 
                               as.factor(concentration) + (1 | site) + (1 | sp_abbr), 
                               family = betabinomial,  # Beta-binomial family
                               data = sugar_analysis_abundance)

# Check for overdispersion with performance package
check_overdispersion(model3_abundance)  # overdispersion
## # Overdispersion test
## 
##  dispersion ratio = 0.970
##           p-value = 0.976
## No overdispersion detected.
# check model assumptions with performance package
check_model(model3_abundance, verbose = T)
## Not enough model terms in the conditional part of the model to check for
##   multicollinearity.
## `check_outliers()` does not yet support models of class `glmmTMB`.
## Using `ci_type = "gaussian"` because model is not bernoulli.

# check residuals with DHARMa package
tmp_model3_ab_sim <- simulateResiduals(fittedModel = model3_abundance)
plot(tmp_model3_ab_sim)

The beta-binomial model shows no signs of overdispersion, the outlier test performed by the DHARMa package is not significant, and the residuals vs predicted values plot does not show blatant or strong patterns in the residuals. The diagnostics plots from the performance package show no clear deviation from model assumptions.

2.2.3 Results interpretation and visualization
# show model results
Anova(model3_abundance, type = "II")
## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: cbind(individuals, species_total_exp - individuals)
##                           Chisq Df Pr(>Chisq)    
## as.factor(concentration) 195.72  4  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(model3_abundance)
##  Family: betabinomial  ( logit )
## Formula:          
## cbind(individuals, species_total_exp - individuals) ~ as.factor(concentration) +  
##     (1 | site) + (1 | sp_abbr)
## Data: sugar_analysis_abundance
## 
##      AIC      BIC   logLik deviance df.resid 
##   3033.5   3073.3  -1508.7   3017.5     1068 
## 
## Random effects:
## 
## Conditional model:
##  Groups  Name        Variance  Std.Dev. 
##  site    (Intercept) 3.959e-10 1.990e-05
##  sp_abbr (Intercept) 5.966e-10 2.443e-05
## Number of obs: 1076, groups:  site, 32; sp_abbr, 27
## 
## Dispersion parameter for betabinomial family (): 1.74 
## 
## Conditional model:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -2.8967     0.1823 -15.893  < 2e-16 ***
## as.factor(concentration)5    0.9732     0.2186   4.453 8.48e-06 ***
## as.factor(concentration)10   1.4581     0.2118   6.883 5.84e-12 ***
## as.factor(concentration)20   1.9305     0.2113   9.138  < 2e-16 ***
## as.factor(concentration)40   2.5757     0.2080  12.381  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Convert model estimates to Odds Ratios
tmp_model_ab_or <- tidy(model3_abundance, effects = "fixed") %>%
  mutate(OR = exp(estimate),  # Convert log-odds to ORs
         OR_low = exp(estimate - 1.96 * std.error),  # Lower CI
         OR_high = exp(estimate + 1.96 * std.error)) # Upper CI

print(tmp_model_ab_or)
## # A tibble: 5 × 10
##   effect component term     estimate std.error statistic  p.value      OR OR_low
##   <chr>  <chr>     <chr>       <dbl>     <dbl>     <dbl>    <dbl>   <dbl>  <dbl>
## 1 fixed  cond      (Interc…   -2.90      0.182    -15.9  7.03e-57  0.0552 0.0386
## 2 fixed  cond      as.fact…    0.973     0.219      4.45 8.48e- 6  2.65   1.72  
## 3 fixed  cond      as.fact…    1.46      0.212      6.88 5.84e-12  4.30   2.84  
## 4 fixed  cond      as.fact…    1.93      0.211      9.14 6.38e-20  6.89   4.56  
## 5 fixed  cond      as.fact…    2.58      0.208     12.4  3.30e-35 13.1    8.74  
## # ℹ 1 more variable: OR_high <dbl>
# report model results
report(model3_abundance)
## Random effect variances not available. Returned R2 does not account for random effects.
## Can't calculate log-loss.
## Can't calculate proper scoring rules for models without integer response
##   values.
## Warning in text == "" | text2 == "": longer object length is not a multiple of
## shorter object length
## Warning in text == "" | text2 == "": longer object length is not a multiple of
## shorter object length
## Random effect variances not available. Returned R2 does not account for random effects.
## Can't calculate log-loss.
## Can't calculate proper scoring rules for models without integer response
##   values.
## We fitted a logistic mixed model (estimated using ML and nlminb optimizer) to
## predict cbind(individuals, species_total_exp - individuals) with concentration
## (formula: cbind(individuals, species_total_exp - individuals) ~
## as.factor(concentration)). The model included site as random effects (formula:
## list(~1 | site, ~1 | sp_abbr)). The model's explanatory power related to the
## fixed effects alone (marginal R2) is 0.47. The model's intercept, corresponding
## to concentration = 0, is at -2.90 (95% CI [-3.25, -2.54], p < .001). Within
## this model:
## 
##   - The effect of concentration [10] is statistically significant and positive
## (beta = 0.97, 95% CI [0.54, 1.40], p < .001; Std. beta = 1.46, 95% CI [1.04,
## 1.87])
##   - The effect of concentration [20] is statistically significant and positive
## (beta = 1.46, 95% CI [1.04, 1.87], p < .001; Std. beta = 1.93, 95% CI [1.52,
## 2.34])
##   - The effect of concentration [40] is statistically significant and positive
## (beta = 1.93, 95% CI [1.52, 2.34], p < .001; Std. beta = 2.58, 95% CI [2.17,
## 2.98])
##   - The intercept is statistically significant and positive (beta = 2.58, 95% CI
## [2.17, 2.98], p < .001; Std. beta = -2.90, 95% CI [-3.25, -2.54])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation. and We fitted a logistic
## mixed model (estimated using ML and nlminb optimizer) to predict
## cbind(individuals, species_total_exp - individuals) with concentration
## (formula: cbind(individuals, species_total_exp - individuals) ~
## as.factor(concentration)). The model included site as random effects (formula:
## list(~1 | site, ~1 | sp_abbr)). The model's explanatory power related to the
## fixed effects alone (marginal R2) is 0.47. The model's intercept, corresponding
## to concentration = 0, is at 1.74 (95% CI [1.48, 2.04]). Within this model:
## 
##   - The effect of concentration [10] is statistically significant and positive
## (beta = 0.97, 95% CI [0.54, 1.40], p < .001; Std. beta = 1.46, 95% CI [1.04,
## 1.87])
##   - The effect of concentration [20] is statistically significant and positive
## (beta = 1.46, 95% CI [1.04, 1.87], p < .001; Std. beta = 1.93, 95% CI [1.52,
## 2.34])
##   - The effect of concentration [40] is statistically significant and positive
## (beta = 1.93, 95% CI [1.52, 2.34], p < .001; Std. beta = 2.58, 95% CI [2.17,
## 2.98])
##   - The intercept is statistically significant and positive (beta = 2.58, 95% CI
## [2.17, 2.98], p < .001; Std. beta = -2.90, 95% CI [-3.25, -2.54])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation.

I fitted a generalized linear mixed model (GLMM) with a beta-binomial distribution (estimated using ML and nlminb optimizer) to analyze the effect of sugar concentration on the species-level abundance (modeled as the species-specific proportion of ants on each sugar concentration bait) of foraging ants on baits (formula: cbind(individuals, species_total_exp - individuals) ~ as.factor(concentration)). The model included random intercepts for site (formula: ~ 1|site) as well as species (formula: ~ 1|sp_abbr). The model’s explanatory power related to the fixed effects alone (marginal R2) is 0.47. The very small variance of the random effects of site (variance = 3.959e-10) and species (variance = 5.966e-10) indicates that there is little site- or species-level variation in ant abundance once sugar concentration is accounted for.

The model fit yielded an AIC of 3033.5 and a BIC of 3073.3, with a log-likelihood of -1508.7 and a deviance of 3017.5. The dispersion parameter for the beta-binomial family was estimated to be 1.74, suggesting moderate dispersion in the data, which is appropriate for this family of distributions. The model included 1076 observations, with random effects for 32 sites and 27 species.

Sugar concentration shows a positive and highly significant effect on ant presence (Type II Wald Chi-square test, Chisq(4) = 195.72, p < 2.2e-16), showing that baits with sugar water increase the species-level abundance of ants on the baits compared to pure water baits. Compared to the reference level (0% sugar), species-level ant abundance increased significantly at all concentrations. More specifically, for each concentration (as obtained by Wald z-tests):
- The effect of 5% sugar is statistically significant and positive (beta = 0.97, SE = 0.22, z = 4.45, p = 8.48e-06)
- The effect of 10% sugar is statistically significant and positive (beta = 1.46, SE = 0.21,z = 6.88, p = 5.84e-12)
- The effect of 20% sugar is statistically significant and positive (beta = 1.93, SE = 0.21, z = 9.14, p < 2e-16)
- The effect of 40% sugar is statistically significant and positive (beta = 2.58, SE = 0.21, z = 12.38, p < 2e-16)

I can now run a post-hoc test to test for significant differences between the concentrations.

# compute pairwise comparisons between the concentrations
tmp_model3_ab_emm <- emmeans(model3_abundance, pairwise ~ concentration, adjust = "tukey")

print(tmp_model3_ab_emm)
## $emmeans
##  concentration emmean    SE  df asymp.LCL asymp.UCL
##              0 -2.897 0.182 Inf     -3.25    -2.540
##              5 -1.924 0.131 Inf     -2.18    -1.666
##             10 -1.439 0.117 Inf     -1.67    -1.209
##             20 -0.966 0.110 Inf     -1.18    -0.750
##             40 -0.321 0.102 Inf     -0.52    -0.122
## 
## Results are given on the logit (not the response) scale. 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast                          estimate    SE  df z.ratio p.value
##  concentration0 - concentration5     -0.973 0.219 Inf  -4.453  0.0001
##  concentration0 - concentration10    -1.458 0.212 Inf  -6.883  <.0001
##  concentration0 - concentration20    -1.931 0.211 Inf  -9.138  <.0001
##  concentration0 - concentration40    -2.576 0.208 Inf -12.381  <.0001
##  concentration5 - concentration10    -0.485 0.171 Inf  -2.841  0.0363
##  concentration5 - concentration20    -0.957 0.170 Inf  -5.644  <.0001
##  concentration5 - concentration40    -1.602 0.165 Inf  -9.686  <.0001
##  concentration10 - concentration20   -0.472 0.159 Inf  -2.967  0.0251
##  concentration10 - concentration40   -1.118 0.154 Inf  -7.239  <.0001
##  concentration20 - concentration40   -0.645 0.150 Inf  -4.306  0.0002
## 
## Results are given on the log odds ratio (not the response) scale. 
## P value adjustment: tukey method for comparing a family of 5 estimates

To test for pairwise differences between the different sugar concentration levels, I conducted a post-hoc analysis using Tukey-adjusted Wald z-tests for pairwise comparisons of estimated marginal means (EMMs) on the logit scale.

The estimated marginal means for each sugar concentration level are shown below. These results are given on the logit scale.

Concentration Emmean SE 95% CI Lower Bound 95% CI Upper Bound 0% -2.897 0.182 -3.25 -2.540 5% -1.924 0.131 -2.18 -1.666 10% -1.439 0.117 -1.67 -1.209 20% -0.966 0.110 -1.18 -0.750 40% -0.321 0.102 -0.52 -0.122

The pairwise contrasts for sugar concentration levels (with Tukey adjustment for multiple comparisons) are summarized below.

Contrast Estimate SE z-ratio p-value concentration 0% - 5% -0.973 0.219 -4.453 0.0001 concentration 0% - 10% -1.458 0.212 -6.883 <.0001 concentration 0% - 20% -1.931 0.211 -9.138 <.0001 concentration 0% - 40% -2.576 0.208 -12.381 <.0001 concentration 5% - 10% -0.485 0.171 -2.841 0.0363 concentration 5% - 20% -0.957 0.170 -5.644 <.0001 concentration 5% - 40% -1.602 0.165 -9.686 <.0001 concentration 10% - 20% -0.472 0.159 -2.967 0.0251 concentration 10% - 40% -1.118 0.154 -7.239 <.0001 concentration 20% - 40% -0.645 0.150 -4.306 0.0002

As described above, all sugar concentrations greater than 0% had a significantly positive effect on ant abundance compared to pure water (p < 0.0001). Additionally, all concentrations were significantly different from each other (see Table, p < 0.05).

I can plot the predicted probabilities from the model and the results of the pairwise comparisons:

# get predicted probabilities (transform log-odds)
tmp_emm_preds <- emmeans(model3_abundance, ~ concentration, type = "response")

# Convert to a data frame for plotting
tmp_emm_preds_df <- as.data.frame(tmp_emm_preds)

# get pairwise contrasts
tmp_posthoc_results <- pairs(tmp_emm_preds, adjust = "tukey")

# convert to compact letter display  (CLD) format
tmp_cld_results <- multcomp::cld(tmp_emm_preds, Letters = letters)

# Merge with prediction data
tmp_emm_preds_df$letters <- tmp_cld_results$.group

(tmp_pred_abundance_plot <- ggplot(tmp_emm_preds_df, aes(x = as.factor(concentration), y = prob)) +
  geom_point(size = 2) +   # Points for predicted probabilities
  geom_errorbar(aes(ymin = asymp.LCL, ymax = asymp.UCL), width = 0.1, size = 0.3) +  # 95% CI
  geom_text(aes(label = letters, y = prob, x = 1:5), 
            nudge_x = 0.25,  # move label horizontally (increase to move it right)
            size = 4) +  # Set text size  
  labs(x = "Sugar concentration (%)", y = "Predicted species-level proportion of individuals") +
  theme_bw() +
  theme(axis.title.y = element_text(margin = margin(r = 10), hjust = 0.5), # adjust distance from axis title to axis, center title
        axis.title.x = element_text(margin = margin(t = 10)))
)

# combine both abundance plots into 1 plot
(tmp_abundance_plots <- tmp_raw_abundance_plot + tmp_pred_abundance_plot +
    plot_layout(nrow = 1) + plot_annotation(tag_levels = 'a', tag_suffix = ")"))  # Arrange in 1 row, adds a), b), c)

rm(list = ls(pattern = "^tmp_"))

3. Research Question 3

The overarching research question is: if ants can differentiate between baits of different sugar concentrations, how fast does that differentiation happen ? Do they immediately preferentially recruit to higher concentration baits, or does this preference establish over the time of the experiment ?

In other words, does the proportion of ants on the different sugar concentration baits largely stay the same throughout the experiment, or does the proportion of ants on higher concentrations increase during the 2 hours of the experiment ?

  • H0: the proportion of ants found on each sugar concentration bait is the same across all counting time points of the experiment
  • H1: the proportion ants found on each sugar concentration significantly changes over the time of the experiment, and the proportion of ants on higher sugar concentration increases with the counting time points.

Description of tested variables

  • Response variable:
    • Proportion of ants on each bait at each time point of an experiment, see section on scaling of ant totals (quantitative, from 0 to 1).
  • Explanatory variables:
    • the sugar concentration of the sugar-water solutions added to the baits (ordinal categorical)
    • the time points at which ants are counted, which represent time since experiment start (either ordinal categorical or discrete quantitative)
    • the interaction of sugar concentration ant time since experiment start
  • Random effect:
    • Site (nominal categorical)

3.1 Exploratory data analysis

I can find the number of experiments where no ant showed up at any time point. 210 experiments had at least 1 ant at 1 time point, meaning 239 - 210 = 29 experiments had no ants for the entire duration.

#remove experiments with no ants
tmp_speed_analysis_no_0 <- speed_analysis %>%
  group_by(sample_ID) %>%  # group by experiment
  dplyr::filter(any(individuals > 0, na.rm = T)) %>%  # keep only exp where at least 1 individual is found
  ungroup()

# number of exp with at least 1 ant over the duration of the experiment
length(unique(tmp_speed_analysis_no_0$sample_ID))
## [1] 210
# numbe rof experiments with no ants at any time
length(unique(speed_analysis$sample_ID)) - length(unique(tmp_speed_analysis_no_0$sample_ID))
## [1] 29

I can also plot the distribution of the number of ant individuals found on one bait at one time point. The histogram shows that the vast majority of baits at one time point had no ants on them.

# Histogram of ant counts per bait
ggplot(na.omit(speed_analysis), aes(x = individuals)) + 
  geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
  labs(title = "Distribution of ant counts on baits at one time point", x = "Number of ants", y = "Frequency")

# Histogram of ant counts per bait, removing exp with 0 ants
ggplot(na.omit(tmp_speed_analysis_no_0), aes(x = individuals)) + 
  geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
  labs(title = "Distribution of ant counts on baits at one time point \n(experiments with at least 1 ant present)", x = "Number of ants", y = "Frequency")

# plot ants per experiment across time
ggplot(na.omit(speed_analysis), aes(x = count_time, y = total_ants_time, group = sample_ID)) + 
  geom_line(aes(color = sample_ID), alpha = 0.7) + 
  geom_point() +
  labs(title = "Total ant counts over time", x = "Time since experiment start (minutes)", y = "Number of ants per experiment")+
  theme(legend.position = "none")

# plot ants per concentration over time
ggplot(na.omit(tmp_speed_analysis_no_0), aes(x = count_time, y = scaled_ants_time, color = concentration, group = concentration)) +
  #geom_point() + 
  geom_smooth(method = "glm", span = 0.9, se = F, size = 1.5)+
  scale_color_viridis_d() +  
  scale_x_continuous(breaks = c(5, 10, 20, 40, 80, 120), limits = c(0, 122))+
  theme_classic()+
  labs(x = "Time since experiment start (minutes)", y = "Proportion of ants", color = "Sugar concentration")
## `geom_smooth()` using formula = 'y ~ x'

3.2 Model choice and model assumptions

I want to test whether the influence of sugar concentration on ant abundance changes over the time of the experiment. My response variable is based on count data: I look at the number of ants present on one concentration at one counting time point, offset by the total number of ants on that experiment at that time point (i.e., the proportion of ants on each bait of one experiment at each counting time point).

Since my response variable is a proportion, I can fit a binomial GLMM with the number of individuals on one bait at one time as a “success”, and the number of ants on all other baits at that time points (total number of ants on the experiment - number of ants on the bait in question) as a “failure”. If the data shows significant overdispersion, I can try to fit a beta-binomial GLMM.

The sugar concentration predictor can either be considered as a discrete numerical variable or a ordinal categorical variable. The time variable can be considered a categorical variable (since the counting time points are not linearly distributed) or as a numerical variable (which can also be log-transformed to have a linearized time variable). I fit several models with these different options and check which model performs best.

# binomial GLMM with numerical time
tmp_model4_bin <- glmer(cbind(individuals, total_ants_time - individuals) ~ 
                        concentration * count_time + (1 | site),
                        family = binomial, 
                        data = na.omit(speed_analysis))
## boundary (singular) fit: see help('isSingular')
# check for overdispersion
check_overdispersion(tmp_model4_bin)
## # Overdispersion test
## 
##  dispersion ratio =   1.446
##           p-value = < 0.001
## Overdispersion detected.
# check model residuals
tmp_model4_simres <- simulateResiduals(tmp_model4_bin)
plot(tmp_model4_simres)
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details

The binomial GLMM shows overdispersion. I will try to fit a beta-binomial GLMM and compare the performance of both models.

# beta-binomial GLMM
tmp_model4_betabin <- glmmTMB(cbind(individuals, total_ants_time - individuals) ~ 
                              concentration * count_time + (1 | site), 
                              family = betabinomial(link = "logit"), 
                              data = na.omit(speed_analysis))

# check for overdispersion
check_overdispersion(tmp_model4_betabin)
## # Overdispersion test
## 
##  dispersion ratio = 0.894
##           p-value = 0.488
## No overdispersion detected.
# check model residuals with DHARMa package
tmp_model4_bb_simres <- simulateResiduals(tmp_model4_betabin)
plot(tmp_model4_bb_simres)
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details

# compare initial model performance with beta-binomial GLMM
compare_performance(tmp_model4_betabin, tmp_model4_bin)
## Random effect variances not available. Returned R2 does not account for random effects.
## boundary (singular) fit: see help('isSingular')
## Random effect variances not available. Returned R2 does not account for random effects.
## boundary (singular) fit: see help('isSingular')
## # Comparison of Model Performance Indices
## 
## Name               |    Model |   AIC (weights) |  AICc (weights)
## -----------------------------------------------------------------
## tmp_model4_betabin |  glmmTMB |  9499.1 (>.999) |  9499.2 (>.999)
## tmp_model4_bin     | glmerMod | 16568.1 (<.001) | 16568.2 (<.001)
## 
## Name               |   BIC (weights) | R2 (cond.) | R2 (marg.) |  RMSE | Sigma | Log_loss
## -----------------------------------------------------------------------------------------
## tmp_model4_betabin |  9581.2 (>.999) |            |      0.373 | 0.294 | 2.666 |         
## tmp_model4_bin     | 16643.4 (<.001) |            |      0.744 | 0.274 | 1.000 |

The beta-binomial model performs significantly better than the binomial GLMM, and will be used for further analysis.

The counting time can be modeled either as a categorical variable, as a numerical variable or as a log-transformed numerical variable. I model all three and compare model performance.

# beta-binomial GLMM with categorical count times
tmp_model4_betabin_cat <- glmmTMB(cbind(individuals, total_ants_time - individuals) ~ 
                              concentration * as.factor(count_time) + (1 | site), 
                              family = betabinomial(link = "logit"), 
                              data = na.omit(speed_analysis))

# check for overdispersion
check_overdispersion(tmp_model4_betabin_cat)
## # Overdispersion test
## 
##  dispersion ratio = 0.890
##           p-value = 0.552
## No overdispersion detected.
# check model residuals with DHARMa package
tmp_model4_bb_simres <- simulateResiduals(tmp_model4_betabin_cat)
plot(tmp_model4_bb_simres)
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details

# beta-binomial GLMM with log-transformed numerical count times
tmp_model4_betabin_log <- glmmTMB(cbind(individuals, total_ants_time - individuals) ~ 
                              concentration * log(count_time) + (1 | site), 
                              family = betabinomial(link = "logit"), 
                              data = na.omit(speed_analysis))

# check for overdispersion
check_overdispersion(tmp_model4_betabin_log)
## # Overdispersion test
## 
##  dispersion ratio = 0.895
##           p-value = 0.544
## No overdispersion detected.
# check model residuals with DHARMa package
tmp_model4_bb_simres <- simulateResiduals(tmp_model4_betabin_log)
plot(tmp_model4_bb_simres)
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details

# compare both model performances
compare_performance(tmp_model4_betabin, tmp_model4_betabin_cat, tmp_model4_betabin_log)
## Random effect variances not available. Returned R2 does not account for random effects.
## Random effect variances not available. Returned R2 does not account for random effects.
## Random effect variances not available. Returned R2 does not account for random effects.
## # Comparison of Model Performance Indices
## 
## Name                   |   Model |  AIC (weights) | AICc (weights)
## ------------------------------------------------------------------
## tmp_model4_betabin     | glmmTMB | 9499.1 (0.833) | 9499.2 (0.833)
## tmp_model4_betabin_cat | glmmTMB | 9530.2 (<.001) | 9530.5 (<.001)
## tmp_model4_betabin_log | glmmTMB | 9502.4 (0.167) | 9502.4 (0.167)
## 
## Name                   |  BIC (weights) | R2 (cond.) | R2 (marg.) |  RMSE
## -------------------------------------------------------------------------
## tmp_model4_betabin     | 9581.2 (0.833) |            |      0.373 | 0.294
## tmp_model4_betabin_cat | 9749.0 (<.001) |            |      0.382 | 0.294
## tmp_model4_betabin_log | 9584.4 (0.167) |            |      0.373 | 0.294
## 
## Name                   | Sigma | Log_loss
## -----------------------------------------
## tmp_model4_betabin     | 2.666 |         
## tmp_model4_betabin_cat | 2.675 |         
## tmp_model4_betabin_log | 2.671 |

The model with count time as a numerical variable performs much better than the categorical count time model and the log(time) model, so I continue the analysis with time as a numerical variable.

I also compare models where sugar concentration is modeled as a numerical variable and as a categorical variable.

# beta-binomial GLMM with numerical concentration
tmp_model4_betabin_num <- glmmTMB(cbind(individuals, total_ants_time - individuals) ~ 
                              as.numeric(as.character(concentration)) * count_time + (1 | site), 
                              family = betabinomial(link = "logit"), 
                              data = na.omit(speed_analysis))

# check for overdispersion
check_overdispersion(tmp_model4_betabin_num)
## # Overdispersion test
## 
##  dispersion ratio = 0.886
##           p-value =  0.52
## No overdispersion detected.
# check model residuals with DHARMa package
tmp_model4_bb_simres <- simulateResiduals(tmp_model4_betabin_num)
plot(tmp_model4_bb_simres)

# compare both model performances
compare_performance(tmp_model4_betabin, tmp_model4_betabin_num)
## Random effect variances not available. Returned R2 does not account for random effects.
## Random effect variances not available. Returned R2 does not account for random effects.
## # Comparison of Model Performance Indices
## 
## Name                   |   Model |  AIC (weights) | AICc (weights)
## ------------------------------------------------------------------
## tmp_model4_betabin     | glmmTMB | 9499.1 (>.999) | 9499.2 (>.999)
## tmp_model4_betabin_num | glmmTMB | 9576.6 (<.001) | 9576.6 (<.001)
## 
## Name                   |  BIC (weights) | R2 (cond.) | R2 (marg.) |  RMSE
## -------------------------------------------------------------------------
## tmp_model4_betabin     | 9581.2 (>.999) |            |      0.373 | 0.294
## tmp_model4_betabin_num | 9617.6 (<.001) |            |      0.290 | 0.295
## 
## Name                   | Sigma | Log_loss
## -----------------------------------------
## tmp_model4_betabin     | 2.666 |         
## tmp_model4_betabin_num | 2.517 |

The model with concentration as a categorical variable performs much better than the numerical concentration model, so I continue the analysis with concentration as a categorical variable.

# define final model
model4 <- glmmTMB(cbind(individuals, total_ants_time - individuals) ~ 
                              concentration * count_time + (1 | site), 
                              family = betabinomial(link = "logit"), 
                              data = na.omit(speed_analysis))

# check for overdispersion
check_overdispersion(model4)
## # Overdispersion test
## 
##  dispersion ratio = 0.894
##           p-value = 0.488
## No overdispersion detected.
# check model residuals with DHARMa package
tmp_model4_bb_simres <- simulateResiduals(model4)
plot(tmp_model4_bb_simres)
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details

The beta-binomial model shows no signs of overdispersion (tested with the performance package) and the residuals vs predicted values plot as well as the Q-Q plot from the DHARMa package do not show blatant or strong patterns in the residuals.

3.3 Results interpretation and visualization

The model is a beta-binomial GLMM.

# type III anova table
options(contrasts = c("contr.sum", "contr.poly"))  # specify sum-to-zero contrasts
Anova(model4, type = "III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: cbind(individuals, total_ants_time - individuals)
##                            Chisq Df Pr(>Chisq)    
## (Intercept)              237.544  1  < 2.2e-16 ***
## concentration             96.891  4  < 2.2e-16 ***
## count_time                15.609  1  7.789e-05 ***
## concentration:count_time  32.528  4  1.492e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# show model summary
summary(model4)
##  Family: betabinomial  ( logit )
## Formula:          
## cbind(individuals, total_ants_time - individuals) ~ concentration *  
##     count_time + (1 | site)
## Data: na.omit(speed_analysis)
## 
##      AIC      BIC   logLik deviance df.resid 
##   9499.1   9581.2  -4737.6   9475.1     6878 
## 
## Random effects:
## 
## Conditional model:
##  Groups Name        Variance  Std.Dev. 
##  site   (Intercept) 7.902e-10 2.811e-05
## Number of obs: 6890, groups:  site, 39
## 
## Dispersion parameter for betabinomial family (): 2.67 
## 
## Conditional model:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -2.085425   0.135308 -15.412  < 2e-16 ***
## concentration5              0.360732   0.177800   2.029  0.04247 *  
## concentration10             0.368354   0.176698   2.085  0.03710 *  
## concentration20             1.154534   0.169624   6.806 1.00e-11 ***
## concentration40             1.257059   0.164890   7.624 2.47e-14 ***
## count_time                 -0.007682   0.001944  -3.951 7.79e-05 ***
## concentration5:count_time   0.006545   0.002447   2.675  0.00747 ** 
## concentration10:count_time  0.010798   0.002402   4.496 6.93e-06 ***
## concentration20:count_time  0.007082   0.002341   3.025  0.00249 ** 
## concentration40:count_time  0.011956   0.002297   5.205 1.94e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# report main model findings
report(model4)
## Random effect variances not available. Returned R2 does not account for random effects.
## Can't calculate log-loss.
## Can't calculate proper scoring rules for models without integer response
##   values.
## Warning in text == "" | text2 == "": longer object length is not a multiple of
## shorter object length
## Warning in text == "" | text2 == "": longer object length is not a multiple of
## shorter object length
## Random effect variances not available. Returned R2 does not account for random effects.
## Can't calculate log-loss.
## Can't calculate proper scoring rules for models without integer response
##   values.
## We fitted a logistic mixed model (estimated using ML and nlminb optimizer) to
## predict cbind(individuals, total_ants_time - individuals) with concentration
## and count_time (formula: cbind(individuals, total_ants_time - individuals) ~
## concentration * count_time). The model included site as random effect (formula:
## ~1 | site). The model's explanatory power related to the fixed effects alone
## (marginal R2) is 0.37. The model's intercept, corresponding to concentration =
## and count_time = 0, is at -2.09 (95% CI [-2.35, -1.82], p < .001). Within this
## model:
## 
##   - The effect of concentration [10] is statistically significant and negative
## (beta = -7.68e-03, 95% CI [-0.01, -3.87e-03], p < .001; Std. beta = -0.30, 95%
## CI [-0.42, -0.18])
##   - The effect of concentration [20] is statistically significant and positive
## (beta = 1.26, 95% CI [0.93, 1.58], p < .001; Std. beta = -0.10, 95% CI [-0.22,
## 0.02])
##   - The effect of concentration [40] is statistically significant and positive
## (beta = 0.36, 95% CI [0.01, 0.71], p = 0.042; Std. beta = 0.52, 95% CI [0.40,
## 0.63])
##   - The effect of count time is statistically significant and positive (beta =
## 0.37, 95% CI [0.02, 0.71], p = 0.037; Std. beta = -0.02, 95% CI [-0.07, 0.04])
##   - The effect of concentration [5] × count time is statistically significant and
## positive (beta = 1.15, 95% CI [0.82, 1.49], p < .001; Std. beta = -0.30, 95% CI
## [-0.43, -0.17])
##   - The effect of concentration [10] × count time is statistically significant
## and positive (beta = 7.08e-03, 95% CI [2.49e-03, 0.01], p = 0.002; Std. beta =
## -0.03, 95% CI [-0.14, 0.08])
##   - The effect of concentration [20] × count time is statistically significant
## and positive (beta = 0.01, 95% CI [7.45e-03, 0.02], p < .001; Std. beta = 0.15,
## 95% CI [0.04, 0.25])
##   - The effect of concentration [40] × count time is statistically significant
## and positive (beta = 6.54e-03, 95% CI [1.75e-03, 0.01], p = 0.007; Std. beta =
## -8.02e-03, 95% CI [-0.11, 0.09])
##   - The intercept is statistically significant and positive (beta = 0.01, 95% CI
## [6.09e-03, 0.02], p < .001; Std. beta = -1.48, 95% CI [-1.54, -1.41])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation. and We fitted a logistic
## mixed model (estimated using ML and nlminb optimizer) to predict
## cbind(individuals, total_ants_time - individuals) with concentration and
## count_time (formula: cbind(individuals, total_ants_time - individuals) ~
## concentration * count_time). The model included site as random effect (formula:
## ~1 | site). The model's explanatory power related to the fixed effects alone
## (marginal R2) is 0.37. The model's intercept, corresponding to concentration =
## and count_time = 0, is at 2.67 (95% CI [2.43, 2.93]). Within this model:
## 
##   - The effect of concentration [10] is statistically significant and negative
## (beta = -7.68e-03, 95% CI [-0.01, -3.87e-03], p < .001; Std. beta = -0.30, 95%
## CI [-0.42, -0.18])
##   - The effect of concentration [20] is statistically significant and positive
## (beta = 1.26, 95% CI [0.93, 1.58], p < .001; Std. beta = -0.10, 95% CI [-0.22,
## 0.02])
##   - The effect of concentration [40] is statistically significant and positive
## (beta = 0.36, 95% CI [0.01, 0.71], p = 0.042; Std. beta = 0.52, 95% CI [0.40,
## 0.63])
##   - The effect of count time is statistically significant and positive (beta =
## 0.37, 95% CI [0.02, 0.71], p = 0.037; Std. beta = -0.02, 95% CI [-0.07, 0.04])
##   - The effect of concentration [5] × count time is statistically significant and
## positive (beta = 1.15, 95% CI [0.82, 1.49], p < .001; Std. beta = -0.30, 95% CI
## [-0.43, -0.17])
##   - The effect of concentration [10] × count time is statistically significant
## and positive (beta = 7.08e-03, 95% CI [2.49e-03, 0.01], p = 0.002; Std. beta =
## -0.03, 95% CI [-0.14, 0.08])
##   - The effect of concentration [20] × count time is statistically significant
## and positive (beta = 0.01, 95% CI [7.45e-03, 0.02], p < .001; Std. beta = 0.15,
## 95% CI [0.04, 0.25])
##   - The effect of concentration [40] × count time is statistically significant
## and positive (beta = 6.54e-03, 95% CI [1.75e-03, 0.01], p = 0.007; Std. beta =
## -8.02e-03, 95% CI [-0.11, 0.09])
##   - The intercept is statistically significant and positive (beta = 0.01, 95% CI
## [6.09e-03, 0.02], p < .001; Std. beta = -1.48, 95% CI [-1.54, -1.41])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation.

I fitted a generalized linear mixed model (GLMM) with a beta-binomial distribution (estimated using ML and nlminb optimizer) to analyze the effect of sugar concentration, time since experiment start and the interaction of time and sugar concentration on the abundance of foraging ants on baits (modeled as the proportion of ants on each sugar concentration bait at each counting time) (formula: cbind(individuals, total_ants_time - individuals) ~ concentration * count_time). The model included a random intercept for site (formula: ~ 1|site). The model’s explanatory power related to the fixed effects alone (marginal R2) is 0.37. The very small variance of the random effect (variance = 7.902e-10) indicates that there is little site-level variation in ant abundance once sugar concentration and counting time is accounted for.

The model fit yielded an AIC of 9499.1 and a BIC of 9581.2, with a log-likelihood of -4737.6 and a deviance of 9475.1. The dispersion parameter for the beta-binomial family was estimated to be 2.67, which is appropriate for this family of distributions. The model included 6890 observations, with random effects for 39 sites.

The type III Wald chi-square tests from the analysis of deviance table indicate a highly significant effect of sugar concentration (Chisq(4) = 96.891, p < 2.2e-16), time since experiment start (Chisq(1) = 15.609, p = 7.789e-05), and of the interaction of sugar concentration and time since experiment time (Chisq(4) = 32.528, p = 1.492e-06) on ant foraging, suggesting that the proportion of ants on each sugar concentration was not uniform and that it changed over the course of the experiment.

I can now run a post-hoc analysis to look at the specific slope of the proportion of ants on each sugar concentration bait across experiment time, as well as to test whether there are significant differences in the slopes between concentrations.

I also run a post-hoc analysis to compare the predicted proportion of ants on each sugar concentrations at the first count time since the of the experiment (count_time = 5), i.e. to test whether there are immediate preferences for specific concentrations.

# Get estimated slopes (trends) of count_time for each concentration
tmp_slope_comparisons <- emtrends(model4, pairwise ~ concentration, var = "count_time", adjust = "tukey")

# Display results
summary(tmp_slope_comparisons, test = T)
## $emtrends
##  concentration count_time.trend      SE  df asymp.LCL asymp.UCL
##  0                     -0.00768 0.00194 Inf -0.011493  -0.00387
##  5                     -0.00114 0.00149 Inf -0.004054   0.00178
##  10                     0.00312 0.00141 Inf  0.000349   0.00588
##  20                    -0.00060 0.00131 Inf -0.003164   0.00196
##  40                     0.00427 0.00122 Inf  0.001881   0.00667
## 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast                           estimate      SE  df z.ratio p.value
##  concentration0 - concentration5   -0.006545 0.00245 Inf  -2.675  0.0577
##  concentration0 - concentration10  -0.010798 0.00240 Inf  -4.496  0.0001
##  concentration0 - concentration20  -0.007082 0.00234 Inf  -3.025  0.0210
##  concentration0 - concentration40  -0.011956 0.00230 Inf  -5.205  <.0001
##  concentration5 - concentration10  -0.004253 0.00205 Inf  -2.075  0.2311
##  concentration5 - concentration20  -0.000537 0.00198 Inf  -0.271  0.9988
##  concentration5 - concentration40  -0.005411 0.00193 Inf  -2.810  0.0398
##  concentration10 - concentration20  0.003716 0.00192 Inf   1.933  0.2999
##  concentration10 - concentration40 -0.001158 0.00187 Inf  -0.620  0.9719
##  concentration20 - concentration40 -0.004875 0.00179 Inf  -2.722  0.0508
## 
## P value adjustment: tukey method for comparing a family of 5 estimates
# Use emmeans to get marginal means for concentrations at count_time = 5
tmp_emmeans_results <- emmeans(model4, ~ concentration | count_time, at = list(count_time = 5))

# Perform pairwise contrasts between concentrations at count_time = 5
tmp_contrast_results <- contrast(tmp_emmeans_results, method = "pairwise")

# Display the results
summary(tmp_contrast_results)
## count_time = 5:
##  contrast                          estimate    SE  df z.ratio p.value
##  concentration0 - concentration5    -0.3935 0.168 Inf  -2.341  0.1321
##  concentration0 - concentration10   -0.4223 0.167 Inf  -2.527  0.0847
##  concentration0 - concentration20   -1.1899 0.160 Inf  -7.419  <.0001
##  concentration0 - concentration40   -1.3168 0.156 Inf  -8.450  <.0001
##  concentration5 - concentration10   -0.0289 0.155 Inf  -0.187  0.9997
##  concentration5 - concentration20   -0.7965 0.147 Inf  -5.414  <.0001
##  concentration5 - concentration40   -0.9234 0.142 Inf  -6.486  <.0001
##  concentration10 - concentration20  -0.7676 0.146 Inf  -5.268  <.0001
##  concentration10 - concentration40  -0.8945 0.141 Inf  -6.340  <.0001
##  concentration20 - concentration40  -0.1269 0.132 Inf  -0.958  0.8738
## 
## Results are given on the log odds ratio (not the response) scale. 
## P value adjustment: tukey method for comparing a family of 5 estimates

To compare the effect of time since experiment start on ant foraging activity across the different sugar concentrations, I conducted a post-hoc analysis using Tukey-adjusted Wald z-tests for pairwise comparisons of the estimated slopes (trends) of the relationship between time since experiment time and ant foraging activity for each sugar concentration on the logit scale.

The estimated slopes of the relationship between time and ant foraging activity for each sugar concentration level are shown below. These results are given on the logit scale.

Concentration Slope (ant activity vs time) SE 95% CI Lower Bound 95% CI Upper Bound 0% -0.00768 0.00194 -0.011493 -0.00387 5% -0.00114 0.00149 -0.004054 0.00178 10% 0.00312 0.00141 0.000349 0.00588 20% -0.00060 0.00131 -0.003164 0.00196 40% 0.00427 0.00122 0.001881 0.00667

The table shows that on baits with 0% sugar, the proportion of ants on the bait decreases across the time of the experiment (slope = -0.00768, 95% CI = [-0.011, -0.004]). On baits with 5% sugar (slope = -0.001, 95% CI = [-0.004, 0.002]) and 20% sugar (slope = -0.0006, 95% CI = [-0.003, 0.002]), the proportion of ants on the baits does not significantly change over the time of the experiment, as shown by slopes very close to 0 and 95% confidence intervals overlapping with 0. Baits with 10% sugar (slope = 0.003, 95% CI = [0.0003, 0.006]) and 40% sugar (slope = 0.004, 95% CI = [0.002, 0.007]) show an increase in the proportion of ants present on the baits over the time of the experiment.

The pairwise contrasts for sugar concentration levels (with Tukey adjustment for multiple comparisons) are summarized below.

Contrast Estimate SE z-ratio p-value concentration 0% - 5% -0.006545 0.00245 -2.675 0.0577 . concentration 0% - 10% -0.010798 0.00240 -4.496 0.0001 concentration 0% - 20% -0.007082 0.00234 -3.025 0.0210 concentration 0% - 40% -0.011956 0.00230 -5.205 <.0001 concentration 5% - 10% -0.004253 0.00205 -2.075 0.2311 concentration 5% - 20% -0.000537 0.00198 -0.271 0.9988 concentration 5% - 40% -0.005411 0.00193 -2.810 0.0398 concentration 10% - 20% 0.003716 0.00192 1.933 0.2999 concentration 10% - 40% -0.001158 0.00187 -0.620 0.9719 concentration 20% - 40% -0.004875 0.00179 -2.722 0.0508 .

The relationship between time since experiment start and ant activity is only marginally different on 5% baits form 0% baits (see table, p = 0.0577). The slope for 10%, 20%, and 40% baits is significantly different from the slope of 0% baits (see table, p < 0.05). This suggests that ant activity changed at different rates over the course of the experiment on low concentration baits (0%, 5%) than on higher concentration baits (10%, 20%, 40%). There is a significant difference in the slope on 5% baits and 40% baits (see table, p = 0.0398), as well as a marginally significant difference between the slopes on 20% baits and 40% baits (see table, p = 0.0508). However, the differences between 5%, 10%, and 20% are not significant (see table, p > 0.05), suggesting that the trends in the proportion of ants on these baits is similar over the time of the experiment.

To test whether there is a significant difference in the predicted proportion of ants at the start of the experiment (0 minutes since experiment start), I conducted a post-hoc analysis using Tukey-adjusted Wald z-tests for pairwise comparisons of the estimated marginal means (EMMs) for each sugar concentration while holding count_time constant at 5.

The estimated marginal means for each sugar concentration level at 5 minutes since experiment start are shown below. These results are given on the logit scale.

Contrast Estimate SE z-ratio p-value concentration 0% - 5% -0.36073 0.178 -2.029 0.2521 concentration 0% - 10% -0.36835 0.177 -2.085 0.2266 concentration 0% - 20% -1.15453 0.170 -6.806 <.0001 concentration 0% - 40% -1.25706 0.165 -7.624 <.0001 concentration 5% - 10% -0.00762 0.163 -0.047 1.0000 concentration 5% - 20% -0.79380 0.155 -5.114 <.0001 concentration 5% - 40% -0.89633 0.150 -5.966 <.0001 concentration 10% - 20% -0.78618 0.154 -5.117 <.0001 concentration 10% - 40% -0.88871 0.149 -5.973 <.0001 concentration 20% - 40% -0.10253 0.140 -0.733 0.9487

As can be seen on the table, there is no significant difference in the predicted proportion of ants at the start of the experiment between the baits with 0%, 5%, and 10% sugar (see table, p > 0.05). Similarly, there is no significant difference in the predicted proportion of ants on the 20% and 40% baits at the start of the experiment (see table, p = 0.95). However, there is a highly significant difference in the predicted proportion of ants at the start of the experiment between these two groups of baits (0%, 5%, 10% vs. 20%, 40%) (see table, p < 0.0001). This shows that according to the model’s predictions, from the start of the experiment there is a higher proportion of ants on the baits with the highest sugar concentrations (20% sugar and 40% sugar) than on the baits with lower sugar concentrations (0% sugar, 5% sugar, and 10% sugar).

I can plot the predicted probabilities from the model:

# define the counting times over which to generate predictions
tmp_count_time_seq <- c(5, 10, 20, 40, 80, 120)

# Use emmeans() to predict over this sequence of count_time values, while keeping concentration as a factor
tmp_emm_time_preds <- emmeans(model4, ~ count_time | concentration, at = list(count_time = tmp_count_time_seq), type = "response")

# Convert the predictions to a dataframe
tmp_emm_time_df <- as.data.frame(tmp_emm_time_preds)

# Plot using ggplot2
(tmp_sugar_vs_time_plot <- ggplot(tmp_emm_time_df, aes(x = count_time, y = prob, color = factor(concentration))) + 
  geom_line() + 
  geom_point(size = 2) + 
  geom_errorbar(aes(ymin = asymp.LCL, ymax = asymp.UCL), width = 0.2) +  # 95% CI
  labs(title = "", 
       x = "Time since experiment start (minutes)", 
       y = "Predicted proportion of individuals", 
       color = "Sugar concentration (%)") + 
  scale_color_viridis_d() +
  scale_x_continuous(breaks = c(5, 10, 20, 40, 80, 120), limits = c(0, 122))+
  theme_bw() +
  theme(panel.grid.major = element_blank(),  # Remove major grid
        panel.grid.minor = element_blank(),
        axis.title.y = element_text(size = 10, margin = margin(r = 10), hjust = 0.5), # adjust distance from axis title to axis, center title
        axis.title.x = element_text(size = 10, margin = margin(t = 10)),
        legend.title = element_text(size = 9, hjust = 0.5),
        legend.text = element_text(size = 8))
)

rm(list = ls(pattern = "^tmp_"))